Source code for phonlab.acoustic.h2h1_

import numpy as np
from scipy.signal import iirpeak,filtfilt,windows, convolve, find_peaks
from scipy import fft
from librosa.util import frame


[docs]
def h2h1(resid,fs,f0df,f0med=None,use_ac=True):
    '''Estimate the amplitude ratio of harmonic 2 and harmonic 1, in decibels.  This function 
uses the method described by Drugman, Kane & Gobl (2012), and was inspired by the matlab
function `get_creak_h2h1()` published in the Covarep Matlab repository of speech analysis software.  
The h2h1 measurements given by this function are correlated with those produced by `phonlab.get_f0_acd()`
but the measurements from `h2h1()` should be used when testing for creaky voice.  

The method here is to:
    * apply a narrow (150Hz) filter centered on the f0med, to the LPC residual (see `phonlab.lpcresidual()`) 
    * produce an autocorrelation of each frame
    * produce a spectrum of the autocorrelation function
    * based on the measured F0 for the frame, pick the spectral peaks closest to F0 and 2*F0
    * and report the amplitude difference between them (the H2-H1 amplitude difference).

Drugman et al. (2012) reported that H2H1 > 0dB is a reliable indicator of creaky voice.


Parameters
==========
    resid : ndarray
        A one-dimensional array of lpc residual samples, as produced by `phon.lpcresidual()`
    fs : int
        the sampling rate of the audio in **resid**.
    f0df : Dataframe
        A Pandas dataframe produced by one of the get_f0 functions 
    f0med : float, default = None
        Normally the median f0 is computed from the f0 values in f0df with the 
        line: `f0med = np.nanmedian(f0df.f0)`, but this parameter lets the user supply an 
        estimate of the median f0 explicitly.
    use_ac : boolean, default = True
        Following Drugman et al. (2012), the function by default finds H1 and H2 in the spectrum 
        of the autocorrelation function.
        
Returns
=======
    df : Dataframe
        The input dataframe `f0df` is returned with a new column `h2h1`

References
==========
    T. Drugman, J. Kane, C. Gobl (2012) Resonator-based Creaky Voice Detection. INTERSPEECH 2012, ISCA's 13th Annual Conference Portland, OR, USA, September 9-13, 2012

Example
=======
    .. code-block:: Python
    
        example_file = importlib.resources.files('phonlab') / 'data/example_audio/sf3_cln.wav'
        x,fs = phon.loadsig(example_file,chansel=[0])

        x,fs = phon.prep_audio(x, fs, target_fs=16000, pre = 0, quiet=True)  # resample, scale
        resid,fs = phon.lpcresidual(x,fs)
        f0df = phon.get_f0_ac(x,fs)

        df = h2h1(resid,fs,f0df)  # <- using information from lpcresidual(), and get_f0_ac()

        # ---- Now plot the results -------
        ret = phon.sgram(x,fs,cmap="Grays") # draw a spectrogram of the sound
        ax2 = ret[0].twinx() 
        ax2.plot(df.sec,df.h2h1,'bd')
        ax2.axhline(0,color="red")  # Drugman et al. use 0dB as the threshold for creaky voice

    .. figure:: images/h2h1.png
        :scale: 33 %
        :alt: a spectrogram with a trace of the h2h1 ratio calculated by phon.h2h1()
        :align: center

        A trace of the h2h1 ratio as calculated from the LPC residual signal by phon.h2h1().  The red
        line at 0 dB marks a proposed threshold for classifying vocal fold vibration as creaky versus modal.

        
    '''
    # get analysis window length and step size from the input f0 dataframe
    l = np.round(f0df['sec'][0]*2, 3) # to the nearest ms
    s = np.round(f0df['sec'][1] - f0df['sec'][0], 3)  # to the nearest ms
    
    step = int(fs * s)  # number of samples between frames
    frame_length = int(fs * l)
    half_frame = round(frame_length/2)
    frame_length = half_frame * 2 + 1    # odd number in frame  
    window = windows.hann(frame_length)

    if f0med==None:
        f0med = np.nanmedian(f0df.f0)

    bandwidth = 150  # narrow band around the median f0
    b,a = iirpeak(f0med,f0med/bandwidth,fs=fs)
    res_n = filtfilt(1,a,resid)  # narrowband filtered residual
    res_n = res_n/np.max(res_n)

    frames = frame(res_n,frame_length=frame_length, hop_length=step,axis=0)
    F0 = np.nan_to_num(f0df.f0,nan=f0med).astype(int)  # f0 values passed into the function

    # ------- autocorrelations of all of the frames in the file -----------
    if use_ac:    
        N = 1024
        while (frame_length+frame_length//2 > N): 
            N = N * 2  # increase fft size if needed

        Sxx = fft.fft(windows.hann(frame_length)*frames,N)
        ra = fft.fft(np.square(np.abs(Sxx)),N)  # matrix of autocorrelations
        ra = np.divide(ra.T,np.max(ra,axis= -1)).T  # frame by frame normalization

        spec = fft.fft(windows.blackmanharris(N)*ra,fs)  # spectrum of the autocorrelation
    else:
        spec = fft.fft(windows.hann(frame_length)*frames,fs) # spectrum of the filtered residual
        
    spec = np.abs(spec[:,0:int(fs/2)-1])
    spec = np.divide(spec.T,np.sqrt(np.sum(spec**2,axis=-1))).T
    spec = 20 * np.log10(spec)
    h2h1 = np.empty(F0.shape[0])
    #h2h1 = np.diagonal(spec[:,F0*2]) - np.diagonal(spec[:,F0])  
    for n in range(F0.shape[0]):
        peaks,props = find_peaks(spec[n,0:1000],distance = F0[n]/2)
        peak1 = int(np.argmin(np.fabs(peaks-F0[n])))
        peak2 = int(np.argmin(np.fabs(peaks-(F0[n]*2))))
        h2h1[n] = spec[n,peaks[peak2]] - spec[n,peaks[peak1]]

    df = f0df.copy()
    df['h2h1'] = h2h1
    return df