Source code for phonlab.acoustic.DPPT
from scipy import fft
from scipy.signal import windows
from librosa import util
from pandas import DataFrame
import numpy as np
from numba import jit
from ..utils.prep_audio_ import prep_audio
def formantPeakPick(spec,n=6): # maybe replace with scipy.pickpeaks ??
'''
spec is a one dimensional array, and we look for peaks that are at least (n-1)*2 points wide
return an array of indices - the locations of the peaks
'''
peakIndex=np.array([]).astype(np.float32)
for i in range(n,len(spec)-n):
is_peak = True
for k in range(1,n): is_peak = is_peak and spec[i]>=spec[i-k] and spec[i]>=spec[i+k]
if is_peak: peakIndex = np.append(peakIndex,i)
return peakIndex
[docs]
def DPPT_formants(x,fs, pre=0, l=0.03, s=0.01, deltaF=1100, n=6):
'''Bozkurt et al.'s (2004) Differential-Phase Peak Tracking (DPPT) method of vowel formant
tracking is implemented in this function. The implementation here is a translation and
interpretation of the matlab routine `formant_CGDZP()` written by Bozkurt and Drugman and
published in the Covarep repository.
This algorithm tracks formants as peaks in the smoothed spectrum, therefore tends to
be more likely than the LPC or IFC methods to track the harmonics of the voice
fundamental frequency when pitch is high, and to report a single peak when
formants have similar frequencies - reporting the same formant frequency for
the close formants, such as when F1 and F2 are close in [u] or F2 and F3 are close
in [i].
It may be that measuring the peaks in a smoothed spectrum is a more
perceptually valid representation of vowel acoustics than a model that focusses on
determining the resonances of the vocal tract (see Chistovich & Lublinskaya, 1979).
Thus, it is appropriate to consider the formant frequencies reported by this DPPT
algorithm to be `perceptual formants`.
Parameters
----------
x : array
a one-dimensional array of audio samples
fs : int
the sampling rate of the audio in **x**
deltaF : int, default = 1100
The expected average interval between formants, used to sort peaks into formant numbers.
preemphasis : float, default = 0
value of the preemphasis factor (0-1). It almost never helps to change this.
l : float, default = 0.03
Length of the pitch analysis window in seconds. The default is 30 milliseconds.
s : float, default = 0.01
"Hop" interval between successive analysis windows. The default is 10 milliseconds
n : int, default = 6
The required width of a peak, in terms of the number of steps in the spectrum. The default as in Bozkurt et al. is about 80 Hz wide.
Returns
-------
df : dataframe
a pandas dataframe with formant measurements at 10 ms intervals (by default).
Note
----
The columns in the output dataframe are:
* sec - measurement time
* F1-4 - the lowest four vowel formants - peaks in the smoothed spectrum.
References
----------
B. Bozkurt, B. Doval, C.D. Alessandro, T. Dutoit (2004) Improved differential phase spectrum processing for formant tracking. InterSpeech ICSLP, 8th International Conference on Spoken Language Processing, Jeju Island, Korea.
L. A. Chistovich, V.V Lublinskaya (1979) The 'center of gravity' effect in vowel spectra and critical distance between the formants: Psychoacoustical study of the perception of vowel-like stimuli. `Hearing Research`, 1, 185-195.
Examples
--------
This code block shows a call to phon.track_formants_DPPT() and also shows a call to LPC formant tracking, the
default method in phon.track_formants(), and uses phon.sgram, and seaborn pointplot() to plot the estimated
formant frequences on the spectrogram.
.. code-block:: Python
example_file = importlib.resources.files('phonlab') / 'data/example_audio/sf3_cln.wav'
x,fs = phon.loadsig(example_file,chansel=[0])
ret = phon.sgram(x,fs, tf=6000, cmap="Reds") # plot the spectrogram
dflpc = phon.track_formants(x,fs)
dot_color = 'dodgerblue'
sns.pointplot(dflpc,x='sec',y='F1',linestyle='none',native_scale=True,marker=".",color=dot_color)
sns.pointplot(dflpc,x='sec',y='F2',linestyle='none',native_scale=True,marker=".",color=dot_color)
sns.pointplot(dflpc,x='sec',y='F3',linestyle='none',native_scale=True,marker=".",color=dot_color)
sns.pointplot(dflpc,x='sec',y='F4',linestyle='none',native_scale=True,marker=".",color=dot_color)
df = phon.track_formants_DPPT(x,fs,deltaF=1100)
dot_color = "darkblue"
sns.pointplot(df,x='sec',y='F1',linestyle='none',native_scale=True,marker=".",color=dot_color)
sns.pointplot(df,x='sec',y='F2',linestyle='none',native_scale=True,marker=".",color=dot_color)
sns.pointplot(df,x='sec',y='F3',linestyle='none',native_scale=True,marker="x",color=dot_color)
sns.pointplot(df,x='sec',y='F4',linestyle='none',native_scale=True,marker=".",color=dot_color)
.. figure:: images/DPPT.png
:scale: 33%
:alt: a spectrogram with the estimated vowel formants marked with light and dark blue dots
:align: center
Plotting the formants found by `track_formants()` (light blue dots) and those found by
`track_formants_DPPT()` (dark blue dots) on the spectrogram of the utterance.
'''
x,fs = prep_audio(x, fs, target_fs=16000, pre=pre, quiet=True) # resample to 16kHz
numFormants = 5
fsLR = 2048
viewRange = round(fsLR/3.2)
frame_length = int(l*fs)
step = int(s*fs)
half_frame = round(frame_length/2)
frame_length = half_frame * 2 + 1 # odd number in frame
NFFT = int(2**(np.ceil(np.log(frame_length)/np.log(2))))
n_list = range(NFFT) # 0,1,2,3,...NFFT-1
#ScalingFreqAxis = np.arange(0.5,1.5,1/(viewRange-1))
frames = util.frame(x,frame_length=frame_length, hop_length=step,axis=0)
nb = frames.shape[0]
f = frames.shape[1]
w = windows.gaussian(frame_length,half_frame)
# calculate the zero-phase spectrogram
zero_phase = np.real(fft.ifft(np.abs(fft.fft(np.diff(w*frames,append=0),NFFT))))
Rfix = meanR = 1.12
Rlist = np.array([]) # history of found R values
ts = (np.array(range(nb)) * step + half_frame)/fs
formants = np.ones((nb,numFormants))*np.nan # fill with NaN
for kk in range(nb):
numPeaks = 0
R = meanR # using past values of R seems to speed up processing dramatically
# -- adjust R: the radius of the analysis circle in Z plane -----
while (numPeaks!=numFormants and R>1.01 and R<1.25):
expEnv = np.exp(np.log(1/R)*n_list) # consider precomputing these and use table look up here
fourierTrans = fft.fft(zero_phase[kk]*expEnv,fsLR)
angFFT = np.angle(fourierTrans[1:viewRange])
chirpGroupDelay = -np.diff(angFFT) # the chirp group delay spectrum
peakIndex = formantPeakPick(chirpGroupDelay,n)
numPeaks = len(peakIndex)
if numPeaks>numFormants and R>=Rfix:
peakIndex = peakIndex[0:numFormants]
R += 0.01
elif numPeaks<numFormants and R<=Rfix:
peakIndex = np.append(peakIndex,np.zeros(numFormants-len(peakIndex)))
R -= 0.01
else:
break
if numPeaks>numFormants:
peakIndex = peakIndex[0:numFormants]
elif numPeaks<numFormants:
peakIndex = np.append(peakIndex,np.zeros(numFormants-len(peakIndex)))
Rlist = np.append(Rlist,R)
meanR = np.mean(Rlist) # use past values in future frames
#print(f"{kk}, {ts[kk]}, {meanR:0.3f},{peakIndex*fs/fsLR}")
formants[kk,:] = peakIndex
formants = formants*fs/fsLR
# assign peaks to formants 1-4 based on expected formant values
Fs_expected = np.zeros(5)
Fs_expected[0] = deltaF/2 # set expectations for fornant frequencies based on deltaF
for i in range(1,5): Fs_expected[i] = Fs_expected[i-1] + deltaF
# tricky line - gives index number for each peak based on which expected formant it is closest to
idxs = np.reshape(np.concatenate([np.nanargmin(np.fabs(formants-Fs_expected[i]),axis=-1) for i in range(numFormants)]),(numFormants,nb)).T
newFs = [formants[i,idxs[i,:]] for i in range(nb)]
list = ['sec', ] + [f"F{i}" for i in range(1,numFormants+1)]
return DataFrame(np.concatenate((ts.reshape(nb,1), newFs), axis=1),columns=(list))