Source code for phonlab.acoustic.get_f0_

import numpy as np
from scipy.signal import windows, find_peaks, spectrogram, peak_prominences, fftconvolve
from scipy import fft
from librosa import feature, util, lpc
from pandas import DataFrame
from scipy import linalg

from ..utils.prep_audio_ import prep_audio
from ..acoustic.lpc_residual import lpcresidual

np.seterr(divide="ignore")

[docs] def get_rms(y, fs, s=0.005, l=0.04, scale=False): """Measure the time-varying root mean square (RMS) amplitude of the signal in **y**. Parameters ========== y : ndarray A one-dimensional array of audio samples fs : int Sampling rate of **x** scale : boolean, default = False optionally scale the rms amplitude to maximum peak. Returns ======= df: pandas DataFrame There are two columns in the returned frame - sec, rms. """ # constants and global variables frame_length = int(fs * l) half_frame = frame_length//2 frame_length = half_frame * 2 + 1 # odd number in frame step = int(fs * s) # number of samples between frames rms = feature.rms(y=y,frame_length=frame_length, hop_length=step,center=False)[0] if scale: rms = 20*np.log10(rms/np.max(rms),where=np.where(rms>0,True,False,out=np.zeros(rms.shape))) else: rms = 20*np.log10(rms,where=np.where(rms>0,True,False,out=np.zeros(rms.shape))) nb = rms.shape[0] # the number of frames sec = (np.array(range(nb)) * step + half_frame).astype(int)/fs return DataFrame({'sec': sec, 'rms':rms})
[docs] def get_f0(y, fs, f0_range = [63,400], s= 0.005): """Track the fundamental frequency of voicing (f0), using autocorrelation This function implements the autocorrelation method described in Boersma (1993). where the autocorrelation function is normalized by the autocorrelation of the analysis window. The raw best fit frame-by-frame values are returned -- that is, this function does not follow Boersma (1993) in using the Viterbi algorithm to choose the optimal path among f0 candidates. The Harmonics-to-Noise ratio (HNR) in each frame is estimated from the peak of the autocorreation function (c) as `10 * log10(c/(1-c))`. Probability of voicing is given from a logistic regression formula using `rms` and `c` to predict the voicing state as determined by EGG data using the function `phonlab.egg_to_oq()` over the 10 speakers in the ASC corpus of Mandarin speech. The prediction of the EGG voicing decision was about 86% correct. Parameters ========== y : ndarray A one-dimensional array of audio samples fs : int Sampling rate of **x** f0_range : list of two integers, default = [63,400] The lowest and highest values to consider in pitch tracking. s : float, default = 0.005 "Hop" interval between successive analysis windows. The default is 5 milliseconds Returns ======= df: pandas DataFrame measurements at 5 msec intervals. Note ==== The columns in the returned dataframe are for each frame of audio: * sec - time at the midpoint of each frame * f0 - estimate of the fundamental frequency * rms - peak normalized rms amplitude in the band from 0 to fs/2 * c - value of the peak autocorrelation found in the frame * HNR - an estimate of the harmonics to noise ratio * probv - probability of being voiced * voiced - a boolean, true if probv > 0.5 References ========== P. Boersma (1993) Accurate short-term analysis of the fundamental frequency and the harmonics-to-noise ratio of a sampled sound. `Institute of Phonetic Sciences, Amsterdam University, Proceedings`. **17**, 97-110. .. figure:: images/get_f0_B93.png :scale: 33 % :alt: a spectrogram with three pitch traces compared - get_f0_acd, get_f0_B93, praat :align: center Comparing the f0 found by `phon.get_f0()` plotted in blue, and the f0 values found by `parselmouth` `to_Pitch()`, plotted with chartreuse dots, and the f0 values found by get_f0_acd, plotted in orange. The traces are offset from each other by 10Hz so they can be seen. """ x, fs = prep_audio(y, fs, target_fs = None, pre = 0.0, quiet=True) # ---- setup constants and global variables ----- step = int(fs * s) # number of samples between frames s_lag = int((1/f0_range[1])*fs) # shortest lag l_lag = int((1/f0_range[0])*fs) # longest lag frame_length = int(l_lag * 3) # room for 3 periods (6 for HNR) half_frame = frame_length//2 frame_length = half_frame * 2 + 1 # odd number in frame N = 1024 while (frame_length+frame_length//2 > N): N = N * 2 # increase fft size if needed # ----- split into frames -------- frames = util.frame(x, frame_length=frame_length, hop_length=step,axis=0) nb = frames.shape[0] f = frames.shape[1] # number of frequency steps # ----- Hann window and its autocorrelation ---------- w = windows.hann(frame_length) s = fft.fft(w,N) + 0.000001 rw = fft.fft(np.square(np.abs(s)),N) rw = rw/np.max(rw) # ------- autocorrelations of all of the frames in the file ----------- Sxx = fft.fft(w*frames,N)+0.000001 ra = fft.fft(np.square(np.abs(Sxx)),N) ra = np.divide(ra.T,np.max(ra,axis= -1)).T # frame by frame normalization # ------ normalized autocorrelations ------------ rx = ra/rw # ------ find best lag in each frame ------- lag = np.array([s_lag + np.argmax(rx[i,s_lag:l_lag]) for i in range(nb)]) # ---- compute columns for Dataframe ------- sec = (np.array(range(nb)) * step + half_frame)/fs f0 = 1/(lag/fs) # convert lags into f0 rms = 20 * np.log10(np.sqrt(np.sum(np.square(np.abs(Sxx)),axis=-1))) c = np.array([np.abs(np.max(rx[i,s_lag:l_lag])) for i in range(nb)]) c = np.where(c>=1,0.999,c) HNR = 10 * np.log10(c/(1-c),where=np.where(c<1,True,False),out=np.zeros(c.shape)) # voicing decision odds = np.exp(-5.92 + (0.133*rms) + (3.324*c)) # logistic formula, trained on ASC corpus probv = odds / (1 + odds) Voiced = probv > 0.5 #Voiced = ((rms - np.mean(rms)) + HNR) > 4 return DataFrame({'sec': sec, 'f0':f0, 'rms':rms, 'c':c, 'HNR': HNR, 'probv': probv, 'voiced': Voiced})
def SRH(Sxx,fs,f0_range): ''' test all of the f0 values (integers) between the min and max of the range and choose the one with the greatest sum of residual harmonics in each frame of the spectrogram. ''' nb = Sxx.shape[0] T = Sxx.shape[1]/fs # interval between frequency steps max_harmonic = 7 f0 = np.empty(nb) SRHval = np.empty((nb)) plus = np.empty((f0_range[1]-f0_range[0],max_harmonic-1),dtype=np.int16) minus = np.empty((f0_range[1]-f0_range[0],max_harmonic-1),dtype=np.int16) # build matrix of frequency indices for each harmonic of each possible f0 for f in range(f0_range[0],f0_range[1]): fT = f*T for k in range(1,max_harmonic): plus[f-f0_range[0],k-1] = int(fT*k) # indeces in the spectrum minus[f-f0_range[0],k-1] = int(fT*(k+0.5)) for n in range(nb): S = Sxx[n] srh = np.sum(S[plus],axis=-1) - np.sum(S[minus],axis=-1) max_srh = np.max(srh) idx_srh = np.argmax(srh) f0[n] = f0_range[0]+idx_srh SRHval[n] = max_srh return f0,SRHval
[docs] def get_f0_srh(y, fs, f0_range = [60,400], isResidual = False, l = 0.06, s=0.005, vthresh=0.07): """Track the fundamental frequency of voicing (f0), using a frequency domain method. This function is an implementation of Drugman and Alwan's (2011) "Summation of Residual Harmonics" (SRH) method of pitch tracking. The signal is downsampled to 16 kHz, and inverse filtered with LPC analysis to remove the influence of vowel formants. Then harmonics are found in the spectrum of the residual signal. Drugman and Alwan found that this technique provides an estimate of F0 that is robust when the audio signal is corrupted by noise. The f0 range is adaptively adjusted. Probability of voicing is given from a logistic regression formula using `rms` and `srh` trained to predict the voicing state as determined by EGG data using the function `phonlab.egg_to_oq()` over the 10 speakers in the ASC corpus of Mandarin speech. The prediction of the EGG voicing decision was about 84% correct. Parameters ========== y : string or ndarray A one-dimensional array of audio samples fs : int Sampling rate of **x** f0_range : list of two integers, default = [60,400] The lowest and highest values to consider in pitch tracking. This algorithm is quite sensitive to the values given in this setting. isResidual : Boolean, default = False If the input array is a residual signal s : float, default = 0.005 "Hop" interval between successive analysis windows. The default is 5 milliseconds Returns ======= df : pandas DataFrame measurements at 5 msec intervals. Note ==== The columns in the returned dataframe are for each frame of audio: * sec - time at the midpoint of each frame * f0 - estimate of the fundamental frequency * rms - rms amplitude in the band from 0 to 5 kHz * srh - value of SRH (normalized sum of the residual harmonics) * probv - estimated probability of voicing * voiced - a boolean decision based on the srh value (see Drugman and Alwan) References ========== T. Drugman, A. Alwan (2011) Joint robust voicing detection and pitch estimation based on residual harmonics. 'ISCA (Florence, Italy)' pp. 1973ff """ x,fs = prep_audio(y, fs, target_fs=16000, pre = 0, quiet=True) # resample, preemphasis frame_length = int(fs * l) # frame size in samples half_frame = frame_length//2 frame_length = half_frame * 2 + 1 # odd number in frame step = int(fs * s) # number of samples between frames # ----- get rms amplitude from audio wav ------------- rms = feature.rms(y=x,frame_length=frame_length,hop_length=step,center=False) rms = 20 * np.log10(rms[0]) # ---- get the f0 from the sum of the residual harmonics (srh) ------------- if isResidual: resid = x else: resid,fs = lpcresidual(x,fs) # get the lpc residual signal w = windows.hamming(frame_length) frames = util.frame(resid,frame_length=frame_length, hop_length=step,axis=0) frames = np.multiply(frames,w) # apply a Hamming window to each frame, for lpc Sxx = np.abs(np.fft.rfft(frames,2**14))+0.00001 # spectrogram of the residual Sxx = np.divide(Sxx.T,linalg.norm(Sxx,axis=-1)).T # amplitude normalized f0,SRHval = SRH(Sxx,fs,f0_range) F0med = int(np.nanmedian(np.where(SRHval<0.1,f0,np.nan))) oldF0med,iters = 0, 0 # iterate once to adjust the pitch range while F0med != oldF0med and iters<1 and np.max(SRHval) > 0.1: oldF0med = F0med iters += 1 f0_range[1] = int(F0med) + 100 # only adjusting the top end of the range f0,SRHval = SRH(Sxx,fs,f0_range) # recalculate F0med = int(np.nanmedian(np.where(SRHval<0.1,f0,np.nan))) # ---------- get voicing decisions -------------- odds = np.exp(3.43 + (0.155*rms) + (13.407*SRHval)) # logistic formula, trained on ASC corpus probv = odds / (1 + odds) voiced = probv > 0.5 # vthresh = 0.06 # Drugman et al voicing decision #if np.std(SRHval) > 0.05: vthresh = vthresh*1.2 #voiced = np.where(SRHval > vthresh,True,False) # ---- get the time at the center of each frame --------------- sec = (np.array(range(frames.shape[0])) * step + half_frame).astype(int)/fs return DataFrame({'sec': sec, 'f0':f0, 'rms':rms, 'srh':SRHval, 'probv': probv, 'voiced': voiced})
def get_f0_ac(y, fs, f0_range = [60,400], l=0.05, s=0.005): """Track the fundamental frequency of voicing (f0), using autocorrelation. This function implements a simple autocorrelation method of pitch tracking with no filtering prior to calculating the autocorrelation. Probability of voicing is given from a logistic regression formula using `rms` and `c` trained to predict the voicing state as determined by EGG data using the function `phonlab.egg_to_oq()` over the 10 speakers in the ASC corpus of Mandarin speech. The prediction of the EGG voicing decision was about 88% correct. Parameters ========== y : ndarray A one-dimensional array of audio samples fs : int Sampling rate of **x** f0_range : list of two integers, default = [63,400] The lowest and highest values to consider in pitch tracking. l : float, default = 0.05 Length of the pitch analysis window in seconds. The default is 50 milliseconds. s : float, default = 0.005 "Hop" interval between successive analysis windows. The default is 5 milliseconds Returns ======= df: pandas DataFrame measurements at 5 msec intervals. Note ==== The columns in the returned dataframe are for each frame of audio: * sec - time at the midpoint of each frame * f0 - estimate of the fundamental frequency * rms - peak normalized rms amplitude in the band from 0 to fs/2 * c - value of the peak autocorrelation found in the frame * probv - estimated probability of voicing * voiced - a boolean, true if probv>0.5 """ x, fs = prep_audio(y, fs, target_fs = 32000, pre = 0, quiet=True) short = fs//f0_range[1] # period of highest allowable frequency - shortest lag long = fs//f0_range[0] # period of lowest allowable frequency - longest lag frame_length = int(fs * l) # room for periods half_frame = frame_length//2 frame_length = half_frame * 2 + 1 # odd number in frame step = int(fs * s) # number of samples between frames rms = feature.rms(y=x,frame_length=frame_length, hop_length=step,center=False) rms = 20*np.log10(rms[0]/np.max(rms[0])) frames = util.frame(x, frame_length=frame_length, hop_length=step,axis=0) nb = frames.shape[0] # the number of frames (or blocks) in the LPC analysis sec = (np.array(range(nb)) * step + half_frame).astype(int)/fs f0 = np.empty(nb) c = np.empty((nb)) # N/(N-t) ac_norm = [frame_length/(frame_length - k) for k in range(frame_length)] for i in range(nb): cormat = np.correlate(frames[i], frames[i], mode='full') # autocorrelation ac = cormat[cormat.size//2:] # the autocorrelation is in the last half of the result ac = ac*ac_norm idx = np.argmax(ac[short:long]) + short # index of peak correlation (in range lowest to highest) f0[i] = 1/(idx/fs) # converted to Hz if (ac[0]<= 0) | (ac[idx] <= 0): c[i] = 0 else: c[i] = np.sqrt(ac[idx]) / np.sqrt(ac[0]) HNR = 10 * np.log10(c/(1-c),where=np.where(c<1,True,False),out=np.zeros(c.shape)) odds = np.exp(0.48 + (0.19*rms) + (5.44*c)) # logistic formula, trained on ASC corpus probv = odds / (1 + odds) voiced = probv > 0.5 return DataFrame({'sec': sec, 'f0':f0, 'rms':rms, 'c':c, 'HNR':HNR, 'probv': probv, 'voiced':voiced}) def f0_from_harmonics(f_p,i,h,nh): ''' Assign harmonic numbers to the peaks in f_p -- this function is used in get_f0_acd f_p: an array of peak frequencies i: the starting peak to look at (0,n) h: the starting harmonic number to assign to this peak (1,n-1) ''' Np = len(f_p) # number of peaks m = np.zeros(Np) f0 = [] m[i] = h f0 = np.append(f0, f_p[i]/h) # f0 if peak i is harmonic h thresh = 0.055 * f0[0] # 5.5% of the f0 value ex = 0 # number of harmonics over h=11 for j in range(i+1,Np): # step through the spectral peaks lowest_deviation = 1000 best_f0 = np.nan for k in range(h+1,nh+1): # step through harmonics test_f0 = f_p[j]/k deviation = abs(test_f0 - f0[0]) if deviation < lowest_deviation: # pick the best harmonic number for this peak lowest_deviation = deviation best_f0 = test_f0 best_k = k if lowest_deviation < thresh: # close enough to be a harmonic m[j] = best_k f0 = np.append(f0,best_f0) if (h>11): ex = ex + 1 h=h+1 C = ((h-1) + (Np - ex))/ np.count_nonzero(m) mean_f0 = np.average(f0,weights=np.arange(len(f0))+1) return C,mean_f0
[docs] def get_f0_acd(y, fs, f0_range=[60,400], l=0.05, s=0.005, prom=5, min_height = 0.6, test_time=-1): """Track the fundamental frequency of voicing, using a frequency domain method. This function implements the 'approximate common denominator" algorithm proposed by Aliik, Mihkla and Ross (1984), which was an improvement on the method proposed by Duifuis, Willems and Sluyter (1982). The algorithm finds candidate harmonic peaks in the spectrum, and chooses a value of f0 that best predicts the harmonic pattern. One feature of this method is that it reports a voice quality measure (the difference in the amplitudes of harmonic 1 and harmonic 2). Probability of voicing is given from a logistic regression formula using `rms`, the `h2h1` ratio, and Duifuis et al.'s harmonicity criterion `C` to predict the voicing state as determined by EGG data using the function `phonlab.egg_to_oq()` over the 10 speakers in the ASC corpus of Mandarin speech. The prediction of the EGG voicing decision was about 86% correct. Aliik et al. (1984) used a cutoff of C < 3.5 as a voicing threshold. Parameters ========== y : ndarray A one-dimensional array of audio samples fs : int the sampling rate of the audio in **y**. f0_range : a list of two integers, default=[60,400] The lowest and highest values to consider in pitch tracking. The algorithm is not particularly sensitive to this parameter, but it can be useful in avoiding pitch-halving or pitch-doubling. l : float, default = 0.05 Length of the pitch analysis window in seconds. The default is 50 milliseconds. s : float, default = 0.005 "Hop" interval between successive analysis windows. The default is 5 milliseconds prom : numeric, default = 5 dB In deciding whether a peak in the spectrum is a possible harmonic, this prominence value is passed to scipy.find_peaks(). A larger value means that the spectral peak must be more prominent to be considered as a possible harmonic peak, and thus the algorithm is less likely to report pitch values when the parameter is given a high value. In general, 20 is a high value, and 3 is low. min_height: numeric, default = 0.6 As a proportion of the range between the lowest amplitude in the spectrum and the highest, only peaks above `min_height` will be considered to be harmonics. The value that is passed to find_peaks() is: `amplitude_min + min_height*(amplitude_range)`. Returns ======= df: pandas DataFrame measurements at 5 msec intervals. Note ==== The columns in the returned dataframe are for each frame of audio: * sec - time at the midpoint of each frame * f0 - estimate of the fundamental frequency * rms - rms amplitude in a low frequency band from 0 to 1200 Hz * h1h2 - the difference in the amplitudes of the first two harmonics (H1 - H2) in dB * C - harmonicity criterion (lower values indicate stronger harmonic pattern) * probv - estimated probability of voicing * voiced - a boolean, true if probv>0.5 References ========== J. Allik, M. Mihkla, J. Ross (1984) Comment on "Measurement of pitch in speech: An implementation of Goldstein's theory of pitch perception" [JASA 71, 1568 (1982)]. `JASA` 75(6), 1855-1857. H. Duifhuis & L.F. Willems (1982) Measurement of pitch in speech: An implementation of Goldstein's theory of pitch perception. `JASA` 71(6), 1568-1580. Example ======= .. code-block:: Python example_file = importlib.resources.files('phonlab') / 'data' / 'example_audio' / 'stereo.wav' x,fs = phon.loadsig(example_file, chansel=[0]) f0df = phon.get_f0_acd(x,fs,prom=18) snd = parselmouth.Sound(str(example_file)).extract_left_channel() # create a Praat Sound object pitch = snd.to_pitch() # create a Praat pitch object f0df2 = phon.pitch_to_df(pitch) # convert it into a Pandas dataframe ret = phon.sgram(x,fs,cmap='Grays') # draw a spectrogram of the sound f0_range = [60,400] ax1 = ret[0] # get the plot axis ax2 = ax1.twinx() # and twin it for plotting f0 ax2.plot(f0df2.sec,f0df2.f0, color='chartreuse',marker="s",linestyle = "none") ax2.plot(f0df.sec,f0df.f0, color='dodgerblue',marker="d",linestyle = "none") ax2.set_ylim(f0_range) ax2.set_ylabel("F0 (Hz)", size=14) for item in ax2.get_yticklabels(): item.set_fontsize(14) .. figure:: images/acd_pitch_trace.png :scale: 33 % :alt: a spectrogram with a pitch trace calculated by get_f0_acd :align: center Comparing the f0 found by `phon.get_f0_acd()` plotted with blue diamonds, and the f0 values found by `parselmouth` `to_Pitch()`, plotted with chartreuse dots. """ nh = 6 # maximum number of harmonics to consider down_fs = nh*400 # down sample frequency x, fs = prep_audio(y, fs, target_fs = down_fs, pre=0,quiet=True) # no preemphasis N = 1024 # FFT size frame_len = int(fs*l) # 40 ms frame half_frame = frame_len//2 frame_length = half_frame * 2 + 1 # odd number in frame step = int(fs*s) # stride between frames noverlap = frame_len - step # points of overlap between successive frames while (frame_len > N): N = N * 2 # increase fft size if needed w = windows.hamming(frame_len) f,ts,Sxx = spectrogram(x,fs=fs,noverlap = noverlap, window=w, nperseg = frame_len, nfft = N, scaling = 'spectrum', mode = 'magnitude', detrend = 'linear') Sxx = Sxx + 0.0001 rms = 20 * np.log10(np.sqrt(np.divide(np.sum(np.square(Sxx),axis=0),len(f)))) Sxx = 20 * np.log10(Sxx) nb = len(ts) # the number of frames in the spectrogram f0 = np.full(nb,np.nan) # array filled with nan h1h2 = np.full(nb,np.nan) # array filled with nan c = np.full(nb,9.0) # default value of c is 9.0 min_dist = int(f0_range[0]/(fs/N)) # min distance btw harmonics max_dist = int(f0_range[1]/(fs/N)) dist = int((min_dist + max_dist)/2) ## temp if test_time>0: i_test = np.argmin(np.fabs(test_time-ts)) # the ts that is closest to this else: i_test = -1 ## temp for idx in range(nb): spec = Sxx[:,idx] height = np.min(spec) + min_height * np.abs(np.max(spec)-np.min(spec)) # required height of a peak peaks,props = find_peaks(spec, height = height, prominence=prom, distance = min_dist, wlen=dist) if len(peaks)>2: # we did find some harmonics? for p in range(3): # for each of the first three spectral peaks for h in range(1,5): # treat it as one of the first four harmonics if (h==p*2): break C,_f0 = f0_from_harmonics(f[peaks],p,h,nh) if idx==i_test: print(f'_f0: {_f0:0.1f}, peak: {p}, harmonic: {h}, C: {C:0.2f}') if (f0_range[0] < _f0) & (_f0 < f0_range[1]) & (C < c[idx]): c[idx] = C i_f0 = np.argmin(np.fabs(_f0 - f)) # f index that is closest to f0 i_2f0 = np.argmin(np.fabs((2 * _f0) - f)) # closest to 2f0 for (h1h2) h1h2[idx] = spec[i_f0] - spec[i_2f0] f0[idx] = _f0 if idx==i_test: # show diagnostic info, only at a target frame if len(peaks)<nh: # the highest harmonic number to consider n = len(peaks) else: n = nh plt.plot(f,spec) plt.vlines(f[peaks[0:n]],np.min(spec),np.max(spec)) plt.axhline(height) print("number of peaks: ",len(peaks), "start time: ", ts[0]) print(f'min_dist = {min_dist}, max_dist = {max_dist}, down_fs={fs}, len(f)={len(f)}, N={N}') print(f"median difference between adjacent peaks {np.median(np.ediff1d(f[peaks[0:n]])):0.2f}") print(f"frequency of the lowest peak {f[peaks[0]]:0.2f}") print(f'mean prominence: {np.mean(props["prominences"][0:n]):0.3f} mean peak: {np.mean(props["peak_heights"][0:n]):0.3f}') print(f"height = {height:0.2f},max={np.max(spec):0.2f}, min={np.min(spec):0.2f}, c={best_c}") print(f"time = {test_time}, f0 = {f0[idx]:0.2f}, h1h2 = {h1h2[idx]:0.2f}") odds = np.exp(8.34 + (0.167*rms) - (0.072*c)) # logistic formula, trained on ASC corpus probv = odds / (1 + odds) voiced = probv > 0.5 return DataFrame({'sec': ts, 'f0':f0, 'rms':rms, 'h1h2':h1h2, 'C':c, 'probv': probv, 'voiced':voiced})