Source code for phonlab.acoustic.shs

from ..utils.prep_audio_ import prep_audio

import numpy as np
from librosa import util
from scipy.interpolate import interp1d
from numpy.fft import rfft, rfftfreq
from pandas import DataFrame
import matplotlib.pyplot as plt

def _maxarg(x,axis = -1):
    try:
        idx = np.argmax(x,axis=axis)
        val = np.max(x,axis=axis)
    except:
        idx = 0
        val = np.nan
        
    return idx,val

[docs] def get_f0_shs(x,fs, f0_range=[50,400], l=0.06, s=0.005, shr_threshold = 0.3, target_time=None): ''' A pitch determination function implementing the subharmonic summation (SHS) algorithm proposed by Dik Hermes (1988), with a method of finding the relative amplitude of a subharmonic pitch (if there is evidence of a subharmonic) as suggested by Xuejing Sun (2002). The method is good at tracking pitch changes when the voice goes into creak, and is relatively insensitive to the setting of the f0_range parameter. If you find that there is pitch halving, try increasing the shr_threshold, and if you find pitch doubling try decreasing shr_threshold and/or the top end of the f0_range. Parameters ========== x : ndarray A one-dimensional array of audio samples fs : int Sampling rate of **x** f0_range : an array of two numbers, default=[50,400] The expected range for f0. There is rarely any need to adjust the lower value. l : float, default = 0.06 Length of analysis windows. The default is 60 milliseconds. s : float, default = 0.005 Step size, of hops between analysis windows. The default is 5 milliseconds. shr_threshold : float, default = 0.3 A value which determines how sensitive the algorithm is in deciding that a subharmonic component is present in the voicing spectrum. target_time : float, default = None If a time value (in seconds) is given, a diagnostic figure comparable to Sun's Figure 1 will be produced for the frame at time target_time. Returns ======= df: pandas DataFrame measurements at (by default) 5 msec intervals. Note ==== The columns in the returned dataframe are for each frame of audio: * sec - time at the midpoint of each frame in seconds * f0 - estimate of the fundamental frequency in Hz * rms - rms amplitude in the frequency band from 0 to fs/2 * shr - The Subharmonic-to-harmonic Ratio. If the ratio is low the subharmonic energy is close to that of the harmonic. References ========== D. Hermes (1988) Measurement of pitch by subharmonic summation. `J. Acoust. Soc. Am.`, 83(1),257-264. X. Sun (2002) Pitch determination and voice quality analysis using subharmonic-to-harmonic ratio. `Proceedings of ICASSP2002` I-333 - I-336. Example ======= .. code-block:: Python example_file = importlib.resources.files('phonlab') / 'data/example_audio/sf3_cln.wav' x,fs = phon.loadsig(example_file,chansel=[0]) ttime = 2.101 df = phon.get_f0_shs(x,fs, shr_threshold=0.2, target_time=2.1) This example shows diagnostic plots from the get_f0_shs() function. The top left panel shows the spectrum at a particular time in the audio file on a log2 scale. The top right and bottom left panels show harmonic and subharmonic summation spectra which are used to produce the difference spectrum in the bottom right. The maximum of the difference spectra is taken as the pitch in this frame. .. figure:: images/shs.png :scale: 60 % :alt: Diagnostic plots of the log spectrum to 1200Hz, and the harmonic and submonic spectra, and in the bottom left panel shows the difference between them. :align: center Diagnostic plots from shr_pitch(). .. figure:: images/shs_pitch.png :scale: 35 % :alt: Pitch trace produced by the get_f0_shs() function. :align: center Pitch trace produced by get_f0_shs() function. ''' y,fs = prep_audio(x,fs,target_fs=16000,quiet=True) top_freq = f0_range[1]*2.5 frame_length = round(l*fs) half_frame = frame_length//2 step = round(s*fs) if target_time != None: # ---- for the diagnostic print ------- fn = int(((target_time * fs)-half_frame)/step) # frame number else: fn = None NFFT = int(2**(np.ceil(np.log(frame_length*(2))/np.log(2)))) freq = rfftfreq(NFFT, d=1./fs) limit = np.argmin(np.abs(freq-top_freq))+1 freq = freq[1:limit] # frequency values in the linear spectrum logf = np.log2(freq) # frequency values in the log2 spectrum minbin = logf[-1] - logf[-2] # minimum step size on the log freq scale ilogf = np.arange(logf[0],logf[-1],minbin) # equal spacing on the log scale ifreq = 2**ilogf maxlogf = np.log2(f0_range[1]) minlogf = np.log2(f0_range[0]) upperbound = np.argmin(np.abs(ilogf-maxlogf)) lowerbound = np.argmin(np.abs(ilogf-minlogf)) N = int(top_freq/f0_range[0]) # number of harmonics to consider N = N - N%2 # make sure it is an even number frames = util.frame(y,frame_length=frame_length, hop_length=step,axis=0) nb = frames.shape[0] f = frames.shape[1] # number of frequency steps w = np.blackman(frame_length) S = np.abs(rfft(w*frames,NFFT)) # spectrogram below topfreq Sxx = S[:,1:limit] rms = 20 * np.log10(np.sqrt(np.sum(np.square(np.abs(Sxx)),axis=-1))) interp_function = interp1d(logf, Sxx) logS = interp_function(ilogf) W = 0.5 + (1/np.pi)*np.arctan(5*(ilogf-np.log2(60))) # W function (Hermes, 1988) to damp freqs below 60 logS = W * logS #logS -= np.min(logS,axis=0) s_points = logS.shape[1] starts = np.round(np.log2(range(1,N+1,1))/minbin).astype(np.int16) starts2 = np.round(np.log2(np.arange(1.5,N+1,1))/minbin).astype(np.int16) shift_mat = np.zeros((nb,s_points,N)) shift_mat2 = np.zeros((nb,s_points,N)) for i in range(N): shift_mat[:,:s_points-starts[i],i] = logS[:,starts[i]:] shift_mat2[:,:s_points-starts2[i],i] = logS[:,starts2[i]:] Hxx = np.sum(shift_mat,axis=-1) # subharmonic summation Pxx = np.sum(shift_mat2,axis=-1) # shifted subharmonic summation Dxx = Hxx - Pxx idx1 = np.argmax(Dxx[:,lowerbound:upperbound],axis= -1) idx1 += lowerbound mag1 = np.max(Hxx[:,idx1],axis=-1) mag1 = np.where(mag1>0,mag1,0.00001) # avoid 0/0 in SHR calculation # consider the possibility that idx1 is a subharmonic - look for harmonic peak at log2(f)+log2(1) step1 = round(np.log2(2 - 0.0625)/minbin) # search bounds for later, with subharmonics step2 = round(np.log2(2 + 0.0625)/minbin) s = idx1 + step1 # where to find the possible harmonic peak e = idx1 + step2 top = min(s_points,upperbound) e = np.where(e>top,top,e) idx2,mag2 = np.array([_maxarg(Hxx[i,s[i]:e[i]]) for i in range(nb)]).T idx2 = (idx2 + s).astype(np.int16) idx2 = np.where(s>upperbound,idx1,idx2) mag2 = np.where(s>upperbound,mag1,mag2) mag2 = np.where(mag2<=0,mag1,mag2) SHR = (mag1-mag2)/(mag1+mag2) SHR = np.where(SHR==0,np.nan,SHR) f0idx = np.where(SHR<shr_threshold,idx2,idx1) F0 = ifreq[f0idx] ts = (np.array(range(nb)) * step + half_frame)/fs # time axis for output if fn!=None: fig,((ax1,ax2),(ax3,ax4)) = plt.subplots(nrows=2,ncols=2) ax1.plot(ilogf,logS[fn,:],color="orange") ax2.plot(ilogf[lowerbound:upperbound],Hxx[fn,lowerbound:upperbound],color='black') ax3.plot(ilogf[lowerbound:upperbound],Pxx[fn,lowerbound:upperbound],color='black') ax3.plot(ilogf[lowerbound:upperbound],Hxx[fn,lowerbound:upperbound],color='black',linestyle="dotted") ax4.plot(ilogf[lowerbound:upperbound],Dxx[fn,lowerbound:upperbound],color='blue') ax2.axvline(ilogf[idx1[fn]],color="green",linestyle="dotted") ax2.axvline(ilogf[idx2[fn]],color="red",linestyle="dotted") ax3.axvline(ilogf[idx1[fn]],color="green",linestyle="dotted") ax3.axvline(ilogf[idx2[fn]],color="red",linestyle="dotted") ax4.axvline(ilogf[idx1[fn]],color="green",linestyle="dotted") ax4.axvline(ilogf[idx2[fn]],color="red",linestyle="dotted") print(f"F0={F0[fn]:0.1f}, shr={SHR[fn]:0.3f}, N={N}") print(f" idx1={ifreq[idx1[fn]]:0.1f}, idx2={ifreq[idx2[fn]]:0.1f}, shr = {SHR[fn]:0.3f}") print(f" mag1={mag1[fn]:0.3f}, mag2={mag2[fn]:0.3f}") return DataFrame({'sec': ts, 'f0': F0, 'rms':rms, 'shr': SHR})
def _shr_pitch(x,fs, f0_range=[64,400], l=0.04, s=0.005, shr_threshold = 0.15, top_freq = 1000, target_time=None): '''An implementation of Xuejing Sun's (2002) SubHarmonic-to-Harmonic Ratio (SHR) pitch determination method. This method is a variant of Hermes' (1988) subharmonic summation method. In addition to computing pitch, the function returns the SHR when there is a measureable subhamonic component, otherwise the value is NaN. Sun found that between 40% and 90% of all voiced frames (depending on the speaker) had no measurable SHR. The algorithm as implemented here is three times faster than the fastest of the other pitch tracking algorithms in `phonlab` -- (the autocorrelation pitch tracker in `phonlab.get_f0`, and the cepstral peak pitch tracker in `phonlab.CPP`. It produces very accurate pitch determination, if the top of the F0 pitch range is chosen appropriately for the talker. Parameters ========== x : ndarray A one-dimensional array of audio samples fs : int Sampling rate of **x** f0_range : an array of two numbers, default=[64,400] The expected range for f0. Choose the top value deliberately for the speaker being analyzed. l : float, default = 0.04 Length of analysis windows. The default is 40 milliseconds. s : float, default = 0.005 Step size, of hops between analysis windows. The default is 5 milliseconds. shr_threshold : float, default = 0.15 A value which determines how sensitive the algorithm is in deciding that a subharmonic component is present in the voicing spectrum. top_frequency : int, default = 1000 (Hz) The spectrum will be limited to this top frequency target_time : float, default = None If a time value (in seconds) is given, a diagnostic figure comparable to Sun's Figure 1 will be produced for the frame at time target_time. Returns ======= df: pandas DataFrame measurements at (by default) 5 msec intervals. Note ==== The columns in the returned dataframe are for each frame of audio: * sec - time at the midpoint of each frame in seconds * f0 - estimate of the fundamental frequency in Hz * shr - The Subharmonic-to-harmonic Ratio. If the ratio is low the subharmonic energy is close to that in the harmonic spectrum. References ========== D. Hermes (1988) Measurement of pitch by subharmonic summation. `J. Acoust. Soc. Am.`, 83(1),257-264. X. Sun (2002) Pitch determination and voice quality analysis using subharmonic-to-harmonic ratio. `Proceedings of ICASSP2002` I-333 - I-336. Example ======= .. code-block:: Python example_file = importlib.resources.files('phonlab') / 'data/example_audio/sf3_cln.wav' x,fs = phon.loadsig(example_file,chansel=[0]) df = phon.shr_pitch(x,fs, f0_range=[50,300], target_time=1.33) This example shows diagnostic plots from the shr_pitch() function. The top left panel shows the spectrum at a particular time in the audio file on a log2 scale. The top right and bottom left panels show harmonic and subharmonic summation spectra which are used to produce the difference spectrum in the bottom right. The maximum of the difference spectra is taken as the pitch in this frame. .. figure:: images/shr.png :scale: 60 % :alt: Diagnostic plots of the log spectrum to 1000Hz, and the harmonic and submonic spectra, and in the bottom left panel shows the difference between them. :align: center Diagnostic plots from shr_pitch(). .. figure:: images/shr_pitch.png :scale: 45 % :alt: Pitch trace produced by the shr_pitch() function. :align: center Pitch trace produced by shr_pitch() function. ''' y,fs = prep_audio(x,fs,target_fs=16000,quiet=True) frame_length = round(l*fs) half_frame = frame_length//2 step = round(s*fs) if target_time != None: # ---- for the diagnostic print ------- fn = int(((target_time * fs)-half_frame)/step) # frame number else: fn = None NFFT = int(2**(np.ceil(np.log(frame_length*(1.5))/np.log(2)))) freq = rfftfreq(NFFT, d=1./fs) limit = np.argmin(np.abs(freq-top_freq))+1 freq = freq[1:limit] # frequency values in the linear spectrum logf = np.log2(freq) # frequency values in the log2 spectrum minbin = logf[-1] - logf[-2] # minimum step size on the log freq scale ilogf = np.arange(logf[0],logf[-1],minbin) # equal spacing on the log scale ifreq = 2**ilogf maxlogf = np.log2(f0_range[1]/2) minlogf = np.log2(f0_range[0]/2) upperbound = np.argmin(np.abs(ilogf-maxlogf)) lowerbound = np.argmin(np.abs(ilogf-minlogf)) N = int(top_freq/f0_range[0]) # number of harmonics to consider N = N - N%2 # make sure it is an even number frames = util.frame(y,frame_length=frame_length, hop_length=step,axis=0) nb = frames.shape[0] SHR = np.ones(nb) * np.nan F0 = np.ones(nb) * np.nan w = np.blackman(frame_length) S = np.abs(rfft(w*frames,NFFT))[:,1:limit] # spectrogram below 1250Hz interp_function = interp1d(logf, S) logS = interp_function(ilogf) W = 0.5 + (1/np.pi)*np.arctan(5*(ilogf-np.log2(60))) # W function (Hermes, 1988) to damp freqs below 60 logS = W * logS logS -= np.min(logS,axis=0) s_points = logS.shape[1] # 'odd' shift matrix log2(1,3,5,...), 'even' shift matrix log2(2,4,6...) odd_starts = np.round(np.log2(range(1,N,2))/minbin).astype(np.int16) even_starts = np.round(np.log2(range(2,N+1,2))/minbin).astype(np.int16) shift_mat_odd = np.zeros((nb,s_points,N//2)) shift_mat_even = np.zeros((nb,s_points,N//2)) for i in range(N//2): shift_mat_odd[:,:s_points-odd_starts[i],i] = logS[:,odd_starts[i]:] shift_mat_even[:,:s_points-even_starts[i],i] = logS[:,even_starts[i]:] Sub = np.sum(shift_mat_odd,axis=-1) Har = np.sum(shift_mat_even,axis=-1) DA = Har-Sub idx1,mag1 = _maxarg(DA[:,lowerbound:upperbound]) idx1 += lowerbound # consider the possibility that idx1 is a subharmonic - look for harmonic peak at log2(f)+log2(1) step1 = round(np.log2(2 - 0.0625)/minbin) # search bounds for later, with subharmonics step2 = round(np.log2(2 + 0.0625)/minbin) s = idx1 + step1 # where to find the possible harmonic peak e = idx1 + step2 #top = min(s_points,upperbound) e = np.where(e>s_points,s_points,e) idx2,mag2 = np.array([_maxarg(DA[i,s[i]:e[i]]) for i in range(nb)]).T idx2 = (idx2 + s).astype(np.int16) # if start > upperbound - then idx2,mag2 are invalid - f0 = ifreq[idx1]*2, SHR = 0 # if mag1 < 0 - then f0 = np.nan, SHR = np.nan # if mag2 < 0 - then f0 = ifreq[idx1]*2, SHR = 0 mask = (s<upperbound) & (mag1>0) & (mag2>0) # constraints on calculating SHR SHR[mask] = (mag1[mask]-mag2[mask])/(mag1[mask]+mag2[mask]) SHR = np.where(mag1<=0,np.nan,SHR) SHR = np.where(mag2<=0,0,SHR) F0 = np.where(mag1>0,ifreq[idx1]*2,np.nan) F0 = np.where(SHR<shr_threshold,ifreq[idx2]*2,F0) F0 = np.where(F0>f0_range[1],F0/2, F0) ts = (np.array(range(nb)) * step + half_frame)/fs # time axis for output if fn!=None: fig,((ax1,ax2),(ax3,ax4)) = plt.subplots(nrows=2,ncols=2) ax1.plot(ilogf,logS[fn,:],color="orange") ax2.plot(ilogf,Har[fn,:],color='black') ax3.plot(ilogf,Sub[fn,:],color='black') ax3.plot(ilogf,Har[fn,:],color='gray',linestyle="dotted") ax4.plot(ilogf,DA[fn,:],color="dodgerblue") ax4.axvline(ilogf[lowerbound],linestyle="dashed") ax4.axvline(ilogf[upperbound],linestyle="dashed") ax4.axvline(ilogf[idx2[fn]],color="red",linestyle="dotted") ax4.axvline(ilogf[idx1[fn]],color="green",linestyle="dotted") print(f"SHR={SHR[fn]}, F0={F0[fn]}, N={N}, s={s[fn]}, e={e[fn]}, upperbound = {upperbound}") print(f" (idx1={ifreq[idx1[fn]]}, mag1={mag1[fn]}), (idx2={ifreq[idx2[fn]]}, mag2={mag2[fn]})") return DataFrame({'sec': ts, 'f0': F0, 'shr': SHR})