Source code for phonlab.acoustic.DPPT

from scipy import fft
from scipy.signal import windows
from librosa import util
from pandas import DataFrame
import numpy as np
from numba import jit
from ..utils.prep_audio_ import prep_audio

def formantPeakPick(spec,n=6):  # maybe replace with scipy.pickpeaks ??
    '''
    spec is a one dimensional array, and we look for peaks that are at least (n-1)*2 points wide

    return an array of indices - the locations of the peaks
    '''
    peakIndex=np.array([]).astype(np.float32)
    for i in range(n,len(spec)-n):
        is_peak = True
        for k in range(1,n):  is_peak = is_peak and spec[i]>=spec[i-k] and spec[i]>=spec[i+k]
        if is_peak:  peakIndex = np.append(peakIndex,i)
            
    return peakIndex

    
[docs] def DPPT_formants(x,fs, pre=0, l=0.03, s=0.01, deltaF=1100, n=6): '''Bozkurt et al.'s (2004) Differential-Phase Peak Tracking (DPPT) method of vowel formant tracking is implemented in this function. The implementation here is a translation and interpretation of the matlab routine `formant_CGDZP()` written by Bozkurt and Drugman and published in the Covarep repository. This algorithm tracks formants as peaks in the smoothed spectrum, therefore tends to be more likely than the LPC or IFC methods to track the harmonics of the voice fundamental frequency when pitch is high, and to report a single peak when formants have similar frequencies - reporting the same formant frequency for the close formants, such as when F1 and F2 are close in [u] or F2 and F3 are close in [i]. It may be that measuring the peaks in a smoothed spectrum is a more perceptually valid representation of vowel acoustics than a model that focusses on determining the resonances of the vocal tract (see Chistovich & Lublinskaya, 1979). Thus, it is appropriate to consider the formant frequencies reported by this DPPT algorithm to be `perceptual formants`. Parameters ---------- x : array a one-dimensional array of audio samples fs : int the sampling rate of the audio in **x** deltaF : int, default = 1100 The expected average interval between formants, used to sort peaks into formant numbers. preemphasis : float, default = 0 value of the preemphasis factor (0-1). It almost never helps to change this. l : float, default = 0.03 Length of the pitch analysis window in seconds. The default is 30 milliseconds. s : float, default = 0.01 "Hop" interval between successive analysis windows. The default is 10 milliseconds n : int, default = 6 The required width of a peak, in terms of the number of steps in the spectrum. The default as in Bozkurt et al. is about 80 Hz wide. Returns ------- df : dataframe a pandas dataframe with formant measurements at 10 ms intervals (by default). Note ---- The columns in the output dataframe are: * sec - measurement time * F1-4 - the lowest four vowel formants - peaks in the smoothed spectrum. References ---------- B. Bozkurt, B. Doval, C.D. Alessandro, T. Dutoit (2004) Improved differential phase spectrum processing for formant tracking. InterSpeech ICSLP, 8th International Conference on Spoken Language Processing, Jeju Island, Korea. L. A. Chistovich, V.V Lublinskaya (1979) The 'center of gravity' effect in vowel spectra and critical distance between the formants: Psychoacoustical study of the perception of vowel-like stimuli. `Hearing Research`, 1, 185-195. Examples -------- This code block shows a call to phon.track_formants_DPPT() and also shows a call to LPC formant tracking, the default method in phon.track_formants(), and uses phon.sgram, and seaborn pointplot() to plot the estimated formant frequences on the spectrogram. .. code-block:: Python example_file = importlib.resources.files('phonlab') / 'data/example_audio/sf3_cln.wav' x,fs = phon.loadsig(example_file,chansel=[0]) ret = phon.sgram(x,fs, tf=6000, cmap="Reds") # plot the spectrogram dflpc = phon.track_formants(x,fs) dot_color = 'dodgerblue' sns.pointplot(dflpc,x='sec',y='F1',linestyle='none',native_scale=True,marker=".",color=dot_color) sns.pointplot(dflpc,x='sec',y='F2',linestyle='none',native_scale=True,marker=".",color=dot_color) sns.pointplot(dflpc,x='sec',y='F3',linestyle='none',native_scale=True,marker=".",color=dot_color) sns.pointplot(dflpc,x='sec',y='F4',linestyle='none',native_scale=True,marker=".",color=dot_color) df = phon.track_formants_DPPT(x,fs,deltaF=1100) dot_color = "darkblue" sns.pointplot(df,x='sec',y='F1',linestyle='none',native_scale=True,marker=".",color=dot_color) sns.pointplot(df,x='sec',y='F2',linestyle='none',native_scale=True,marker=".",color=dot_color) sns.pointplot(df,x='sec',y='F3',linestyle='none',native_scale=True,marker="x",color=dot_color) sns.pointplot(df,x='sec',y='F4',linestyle='none',native_scale=True,marker=".",color=dot_color) .. figure:: images/DPPT.png :scale: 33% :alt: a spectrogram with the estimated vowel formants marked with light and dark blue dots :align: center Plotting the formants found by `track_formants()` (light blue dots) and those found by `track_formants_DPPT()` (dark blue dots) on the spectrogram of the utterance. ''' x,fs = prep_audio(x, fs, target_fs=16000, pre=pre, quiet=True) # resample to 16kHz numFormants = 5 fsLR = 2048 viewRange = round(fsLR/3.2) frame_length = int(l*fs) step = int(s*fs) half_frame = round(frame_length/2) frame_length = half_frame * 2 + 1 # odd number in frame NFFT = int(2**(np.ceil(np.log(frame_length)/np.log(2)))) n_list = range(NFFT) # 0,1,2,3,...NFFT-1 #ScalingFreqAxis = np.arange(0.5,1.5,1/(viewRange-1)) frames = util.frame(x,frame_length=frame_length, hop_length=step,axis=0) nb = frames.shape[0] f = frames.shape[1] w = windows.gaussian(frame_length,half_frame) # calculate the zero-phase spectrogram zero_phase = np.real(fft.ifft(np.abs(fft.fft(np.diff(w*frames,append=0),NFFT)))) Rfix = meanR = 1.12 Rlist = np.array([]) # history of found R values ts = (np.array(range(nb)) * step + half_frame)/fs formants = np.ones((nb,numFormants))*np.nan # fill with NaN for kk in range(nb): numPeaks = 0 R = meanR # using past values of R seems to speed up processing dramatically # -- adjust R: the radius of the analysis circle in Z plane ----- while (numPeaks!=numFormants and R>1.01 and R<1.25): expEnv = np.exp(np.log(1/R)*n_list) # consider precomputing these and use table look up here fourierTrans = fft.fft(zero_phase[kk]*expEnv,fsLR) angFFT = np.angle(fourierTrans[1:viewRange]) chirpGroupDelay = -np.diff(angFFT) # the chirp group delay spectrum peakIndex = formantPeakPick(chirpGroupDelay,n) numPeaks = len(peakIndex) if numPeaks>numFormants and R>=Rfix: peakIndex = peakIndex[0:numFormants] R += 0.01 elif numPeaks<numFormants and R<=Rfix: peakIndex = np.append(peakIndex,np.zeros(numFormants-len(peakIndex))) R -= 0.01 else: break if numPeaks>numFormants: peakIndex = peakIndex[0:numFormants] elif numPeaks<numFormants: peakIndex = np.append(peakIndex,np.zeros(numFormants-len(peakIndex))) Rlist = np.append(Rlist,R) meanR = np.mean(Rlist) # use past values in future frames #print(f"{kk}, {ts[kk]}, {meanR:0.3f},{peakIndex*fs/fsLR}") formants[kk,:] = peakIndex formants = formants*fs/fsLR # assign peaks to formants 1-4 based on expected formant values Fs_expected = np.zeros(5) Fs_expected[0] = deltaF/2 # set expectations for fornant frequencies based on deltaF for i in range(1,5): Fs_expected[i] = Fs_expected[i-1] + deltaF # tricky line - gives index number for each peak based on which expected formant it is closest to idxs = np.reshape(np.concatenate([np.nanargmin(np.fabs(formants-Fs_expected[i]),axis=-1) for i in range(numFormants)]),(numFormants,nb)).T newFs = [formants[i,idxs[i,:]] for i in range(nb)] list = ['sec', ] + [f"F{i}" for i in range(1,numFormants+1)] return DataFrame(np.concatenate((ts.reshape(nb,1), newFs), axis=1),columns=(list))