Source code for phonlab.acoustic.shs

from ..utils.prep_audio_ import prep_audio

import numpy as np
from librosa import util
from scipy.interpolate import interp1d
from numpy.fft import rfft, rfftfreq
from pandas import DataFrame
import matplotlib.pyplot as plt

def _maxarg(x,axis = -1):
    try:
        idx = np.argmax(x,axis=axis)
        val = np.max(x,axis=axis)
    except:
        idx = 0
        val = np.nan
        
    return idx,val


[docs]
def get_f0_shs(x,fs, f0_range=[50,400], l=0.06, s=0.005, shr_threshold = 0.3, target_time=None):
    ''' A pitch determination function implementing the subharmonic summation (SHS) algorithm proposed by Dik Hermes (1988), with a method of finding the relative amplitude of a subharmonic pitch (if there is evidence of a subharmonic) as suggested by Xuejing Sun (2002). The method is good at tracking pitch changes when the voice goes into creak, and is relatively insensitive to the setting of the f0_range parameter.  If you find that there is pitch halving, try increasing the shr_threshold, and if you find pitch doubling try decreasing shr_threshold and/or the top end of the f0_range.

    
Parameters
==========
    x : ndarray
        A one-dimensional array of audio samples
    fs : int
        Sampling rate of **x**
    f0_range : an array of two numbers, default=[50,400]
        The expected range for f0.  There is rarely any need to adjust the lower value.
    l : float, default = 0.06
        Length of analysis windows.  The default is 60 milliseconds.
    s : float, default = 0.005
        Step size, of hops between analysis windows. The default is 5 milliseconds.
    shr_threshold : float, default = 0.3
        A value which determines how sensitive the algorithm is in deciding that a subharmonic 
        component is present in the voicing spectrum. 
    target_time : float, default = None
        If a time value (in seconds) is given, a diagnostic figure comparable to Sun's Figure 1 
        will be produced for the frame at time target_time.

Returns
=======
    df: pandas DataFrame  
        measurements at (by default) 5 msec intervals.

Note
====
The columns in the returned dataframe are for each frame of audio:
    * sec - time at the midpoint of each frame in seconds
    * f0 - estimate of the fundamental frequency in Hz
    * rms - rms amplitude in the frequency band from 0 to fs/2
    * shr - The Subharmonic-to-harmonic Ratio.  If the ratio is low the subharmonic energy is close to that of the harmonic.
    
References
==========
    D. Hermes (1988) Measurement of pitch by subharmonic summation. `J. Acoust. Soc. Am.`, 83(1),257-264.
    
    X. Sun (2002) Pitch determination and voice quality analysis using subharmonic-to-harmonic ratio. `Proceedings of ICASSP2002` I-333 - I-336.

Example
=======

.. code-block:: Python

    example_file = importlib.resources.files('phonlab') / 'data/example_audio/sf3_cln.wav'
    x,fs = phon.loadsig(example_file,chansel=[0])
    ttime = 2.101

    df = phon.get_f0_shs(x,fs, shr_threshold=0.2, target_time=2.1)

This example shows diagnostic plots from the get_f0_shs() function. The top left panel shows the spectrum at a particular time in the audio file on a log2 scale.  The top right and bottom left panels show harmonic and subharmonic summation spectra which are used to produce the difference spectrum in the bottom right.  The maximum of the difference spectra is taken as the pitch in this frame.
 
.. figure:: images/shs.png
    :scale: 60 %
    :alt: Diagnostic plots of the log spectrum to 1200Hz, and the harmonic and submonic spectra, and in the bottom left panel shows the difference between them.
    :align: center  

    Diagnostic plots from shr_pitch().


.. figure:: images/shs_pitch.png
    :scale: 35 %
    :alt: Pitch trace produced by the get_f0_shs() function.
    :align: center  

    Pitch trace produced by get_f0_shs() function.

    '''
    y,fs = prep_audio(x,fs,target_fs=16000,quiet=True)

    top_freq = f0_range[1]*2.5
    
    frame_length = round(l*fs)
    half_frame = frame_length//2
    step = round(s*fs)

    if target_time != None:  # ---- for the diagnostic print -------
        fn = int(((target_time * fs)-half_frame)/step) # frame number
    else:
        fn = None
    
    NFFT = int(2**(np.ceil(np.log(frame_length*(2))/np.log(2))))
    
    freq = rfftfreq(NFFT, d=1./fs)
    limit = np.argmin(np.abs(freq-top_freq))+1  
    freq = freq[1:limit]   # frequency values in the linear spectrum
    logf = np.log2(freq)   # frequency values in the log2 spectrum
    minbin = logf[-1] - logf[-2]  # minimum step size on the log freq scale


    ilogf = np.arange(logf[0],logf[-1],minbin)  # equal spacing on the log scale
    ifreq = 2**ilogf

    maxlogf = np.log2(f0_range[1])
    minlogf = np.log2(f0_range[0])
    upperbound = np.argmin(np.abs(ilogf-maxlogf))
    lowerbound = np.argmin(np.abs(ilogf-minlogf))

    N = int(top_freq/f0_range[0])  # number of harmonics to consider
    N = N - N%2                    # make sure it is an even number

    frames = util.frame(y,frame_length=frame_length, hop_length=step,axis=0)
    nb = frames.shape[0]
    f = frames.shape[1]  # number of frequency steps
    
    w = np.blackman(frame_length)
    S = np.abs(rfft(w*frames,NFFT))  # spectrogram below topfreq
    Sxx = S[:,1:limit]
    rms = 20 * np.log10(np.sqrt(np.sum(np.square(np.abs(Sxx)),axis=-1))) 
    interp_function = interp1d(logf, Sxx) 
    logS = interp_function(ilogf)
    W = 0.5 + (1/np.pi)*np.arctan(5*(ilogf-np.log2(60)))  # W function (Hermes, 1988) to damp freqs below 60
    logS = W * logS
    #logS -= np.min(logS,axis=0)
    s_points = logS.shape[1]

    starts = np.round(np.log2(range(1,N+1,1))/minbin).astype(np.int16) 
    starts2 = np.round(np.log2(np.arange(1.5,N+1,1))/minbin).astype(np.int16) 
    
    shift_mat = np.zeros((nb,s_points,N))
    shift_mat2 = np.zeros((nb,s_points,N))

    for i in range(N):
        shift_mat[:,:s_points-starts[i],i] = logS[:,starts[i]:]
        shift_mat2[:,:s_points-starts2[i],i] = logS[:,starts2[i]:]
    Hxx = np.sum(shift_mat,axis=-1)  # subharmonic summation
    Pxx = np.sum(shift_mat2,axis=-1)  # shifted subharmonic summation
    Dxx = Hxx - Pxx
    
    idx1 = np.argmax(Dxx[:,lowerbound:upperbound],axis= -1) 
    idx1 += lowerbound
    mag1 = np.max(Hxx[:,idx1],axis=-1)
    mag1 = np.where(mag1>0,mag1,0.00001) # avoid 0/0 in SHR calculation
    
    # consider the possibility that idx1 is a subharmonic - look for harmonic peak at log2(f)+log2(1)
    step1 = round(np.log2(2 - 0.0625)/minbin) # search bounds for later, with subharmonics
    step2 = round(np.log2(2 + 0.0625)/minbin)
    s = idx1 + step1 # where to find the possible harmonic peak
    e = idx1 + step2
    top = min(s_points,upperbound)
    e = np.where(e>top,top,e)

    idx2,mag2 = np.array([_maxarg(Hxx[i,s[i]:e[i]]) for i in range(nb)]).T
    idx2 = (idx2 + s).astype(np.int16)

    idx2 = np.where(s>upperbound,idx1,idx2)
    mag2 = np.where(s>upperbound,mag1,mag2)
    mag2 = np.where(mag2<=0,mag1,mag2)

    SHR = (mag1-mag2)/(mag1+mag2)
    SHR = np.where(SHR==0,np.nan,SHR)
    f0idx = np.where(SHR<shr_threshold,idx2,idx1)
    F0 = ifreq[f0idx]
    
    ts = (np.array(range(nb)) * step + half_frame)/fs  # time axis for output

    if fn!=None:
        fig,((ax1,ax2),(ax3,ax4)) = plt.subplots(nrows=2,ncols=2)
        ax1.plot(ilogf,logS[fn,:],color="orange")
        ax2.plot(ilogf[lowerbound:upperbound],Hxx[fn,lowerbound:upperbound],color='black')
        ax3.plot(ilogf[lowerbound:upperbound],Pxx[fn,lowerbound:upperbound],color='black')
        ax3.plot(ilogf[lowerbound:upperbound],Hxx[fn,lowerbound:upperbound],color='black',linestyle="dotted")
        ax4.plot(ilogf[lowerbound:upperbound],Dxx[fn,lowerbound:upperbound],color='blue')

        ax2.axvline(ilogf[idx1[fn]],color="green",linestyle="dotted")
        ax2.axvline(ilogf[idx2[fn]],color="red",linestyle="dotted")
        ax3.axvline(ilogf[idx1[fn]],color="green",linestyle="dotted")
        ax3.axvline(ilogf[idx2[fn]],color="red",linestyle="dotted")
        ax4.axvline(ilogf[idx1[fn]],color="green",linestyle="dotted")
        ax4.axvline(ilogf[idx2[fn]],color="red",linestyle="dotted")

        print(f"F0={F0[fn]:0.1f}, shr={SHR[fn]:0.3f}, N={N}")
        print(f"   idx1={ifreq[idx1[fn]]:0.1f}, idx2={ifreq[idx2[fn]]:0.1f}, shr = {SHR[fn]:0.3f}")
        print(f"   mag1={mag1[fn]:0.3f}, mag2={mag2[fn]:0.3f}")

    return DataFrame({'sec': ts, 'f0': F0, 'rms':rms, 'shr': SHR})


def _shr_pitch(x,fs, f0_range=[64,400], l=0.04, s=0.005, shr_threshold = 0.15, top_freq = 1000, target_time=None):
    '''An implementation of Xuejing Sun's (2002) SubHarmonic-to-Harmonic Ratio (SHR) pitch determination method.  
This method is a variant of Hermes' (1988) subharmonic summation method.  In addition to computing pitch, the 
function returns the SHR when there is a measureable subhamonic component, otherwise the value is NaN.  
Sun found that between 40% and 90% of all voiced frames (depending on the speaker) had no measurable SHR. 

The algorithm as implemented here is three times faster than the fastest of the other pitch tracking algorithms 
in `phonlab` -- (the autocorrelation pitch tracker in `phonlab.get_f0`, and the cepstral peak pitch tracker in `phonlab.CPP`.  
It produces very accurate pitch determination, if the top of the F0 pitch range is chosen appropriately for the talker.
    
Parameters
==========
    x : ndarray
        A one-dimensional array of audio samples
    fs : int
        Sampling rate of **x**
    f0_range : an array of two numbers, default=[64,400]
        The expected range for f0.  Choose the top value deliberately for the speaker being analyzed.
    l : float, default = 0.04
        Length of analysis windows.  The default is 40 milliseconds.
    s : float, default = 0.005
        Step size, of hops between analysis windows. The default is 5 milliseconds.
    shr_threshold : float, default = 0.15
        A value which determines how sensitive the algorithm is in deciding that a subharmonic 
        component is present in the voicing spectrum. 
    top_frequency : int, default = 1000 (Hz)
        The spectrum will be limited to this top frequency
    target_time : float, default = None
        If a time value (in seconds) is given, a diagnostic figure comparable to Sun's Figure 1 
        will be produced for the frame at time target_time.

Returns
=======
    df: pandas DataFrame  
        measurements at (by default) 5 msec intervals.

Note
====
The columns in the returned dataframe are for each frame of audio:
    * sec - time at the midpoint of each frame in seconds
    * f0 - estimate of the fundamental frequency in Hz
    * shr - The Subharmonic-to-harmonic Ratio.  If the ratio is low the subharmonic energy is close to that in the harmonic spectrum.
    
References
==========
    D. Hermes (1988) Measurement of pitch by subharmonic summation. `J. Acoust. Soc. Am.`, 83(1),257-264.
    
    X. Sun (2002) Pitch determination and voice quality analysis using subharmonic-to-harmonic ratio. `Proceedings of ICASSP2002` I-333 - I-336.

Example
=======

.. code-block:: Python

    example_file = importlib.resources.files('phonlab') / 'data/example_audio/sf3_cln.wav'
    x,fs = phon.loadsig(example_file,chansel=[0])

    df = phon.shr_pitch(x,fs, f0_range=[50,300], target_time=1.33)


This example shows diagnostic plots from the shr_pitch() function. The top left panel shows the spectrum at a particular time in the audio file on a log2 scale.  The top right and bottom left panels show harmonic and subharmonic summation spectra which are used to produce the difference spectrum in the bottom right.  The maximum of the difference spectra is taken as the pitch in this frame.
 
.. figure:: images/shr.png
    :scale: 60 %
    :alt: Diagnostic plots of the log spectrum to 1000Hz, and the harmonic and submonic spectra, and in the bottom left panel shows the difference between them.
    :align: center  

    Diagnostic plots from shr_pitch().


.. figure:: images/shr_pitch.png
    :scale: 45 %
    :alt: Pitch trace produced by the shr_pitch() function.
    :align: center  

    Pitch trace produced by shr_pitch() function.

    '''
    y,fs = prep_audio(x,fs,target_fs=16000,quiet=True)

    frame_length = round(l*fs)
    half_frame = frame_length//2
    step = round(s*fs)

    if target_time != None:  # ---- for the diagnostic print -------
        fn = int(((target_time * fs)-half_frame)/step) # frame number
    else:
        fn = None
    
    NFFT = int(2**(np.ceil(np.log(frame_length*(1.5))/np.log(2))))
    
    freq = rfftfreq(NFFT, d=1./fs)
    limit = np.argmin(np.abs(freq-top_freq))+1  
    freq = freq[1:limit]   # frequency values in the linear spectrum
    logf = np.log2(freq)   # frequency values in the log2 spectrum
    minbin = logf[-1] - logf[-2]  # minimum step size on the log freq scale


    ilogf = np.arange(logf[0],logf[-1],minbin)  # equal spacing on the log scale
    ifreq = 2**ilogf

    maxlogf = np.log2(f0_range[1]/2)
    minlogf = np.log2(f0_range[0]/2)
    upperbound = np.argmin(np.abs(ilogf-maxlogf))
    lowerbound = np.argmin(np.abs(ilogf-minlogf))

    N = int(top_freq/f0_range[0])  # number of harmonics to consider
    N = N - N%2                    # make sure it is an even number

    frames = util.frame(y,frame_length=frame_length, hop_length=step,axis=0)
    nb = frames.shape[0]
    SHR = np.ones(nb) * np.nan
    F0 = np.ones(nb) * np.nan
    
    w = np.blackman(frame_length)
    S = np.abs(rfft(w*frames,NFFT))[:,1:limit]  # spectrogram below 1250Hz
    interp_function = interp1d(logf, S) 
    logS = interp_function(ilogf)
    W = 0.5 + (1/np.pi)*np.arctan(5*(ilogf-np.log2(60)))  # W function (Hermes, 1988) to damp freqs below 60
    logS = W * logS
    logS -= np.min(logS,axis=0)
    s_points = logS.shape[1]

    # 'odd' shift matrix log2(1,3,5,...), 'even' shift matrix log2(2,4,6...)
    odd_starts = np.round(np.log2(range(1,N,2))/minbin).astype(np.int16) 
    even_starts = np.round(np.log2(range(2,N+1,2))/minbin).astype(np.int16) 
    shift_mat_odd = np.zeros((nb,s_points,N//2))
    shift_mat_even = np.zeros((nb,s_points,N//2))

    for i in range(N//2):
        shift_mat_odd[:,:s_points-odd_starts[i],i] = logS[:,odd_starts[i]:]
        shift_mat_even[:,:s_points-even_starts[i],i] = logS[:,even_starts[i]:]
    Sub = np.sum(shift_mat_odd,axis=-1)
    Har = np.sum(shift_mat_even,axis=-1)
    DA = Har-Sub
    
    idx1,mag1 = _maxarg(DA[:,lowerbound:upperbound]) 
    idx1 += lowerbound

    # consider the possibility that idx1 is a subharmonic - look for harmonic peak at log2(f)+log2(1)
    step1 = round(np.log2(2 - 0.0625)/minbin) # search bounds for later, with subharmonics
    step2 = round(np.log2(2 + 0.0625)/minbin)
    s = idx1 + step1 # where to find the possible harmonic peak
    e = idx1 + step2
    #top = min(s_points,upperbound)
    e = np.where(e>s_points,s_points,e)
    
    idx2,mag2 = np.array([_maxarg(DA[i,s[i]:e[i]]) for i in range(nb)]).T
    idx2 = (idx2 + s).astype(np.int16)
    
    # if start > upperbound - then idx2,mag2 are invalid - f0 = ifreq[idx1]*2, SHR = 0
    # if mag1 < 0 - then f0 = np.nan, SHR = np.nan
    # if mag2 < 0 - then f0 = ifreq[idx1]*2, SHR = 0
   
    mask = (s<upperbound) & (mag1>0) & (mag2>0)   # constraints on calculating SHR
    SHR[mask] = (mag1[mask]-mag2[mask])/(mag1[mask]+mag2[mask])
    SHR = np.where(mag1<=0,np.nan,SHR)
    SHR = np.where(mag2<=0,0,SHR)
    F0 = np.where(mag1>0,ifreq[idx1]*2,np.nan)
    F0 = np.where(SHR<shr_threshold,ifreq[idx2]*2,F0)
    F0 = np.where(F0>f0_range[1],F0/2, F0)

    ts = (np.array(range(nb)) * step + half_frame)/fs  # time axis for output

    if fn!=None:
        fig,((ax1,ax2),(ax3,ax4)) = plt.subplots(nrows=2,ncols=2)
        ax1.plot(ilogf,logS[fn,:],color="orange")
        ax2.plot(ilogf,Har[fn,:],color='black')
        ax3.plot(ilogf,Sub[fn,:],color='black')
        ax3.plot(ilogf,Har[fn,:],color='gray',linestyle="dotted")
        ax4.plot(ilogf,DA[fn,:],color="dodgerblue")
        ax4.axvline(ilogf[lowerbound],linestyle="dashed")
        ax4.axvline(ilogf[upperbound],linestyle="dashed")
        ax4.axvline(ilogf[idx2[fn]],color="red",linestyle="dotted")
        ax4.axvline(ilogf[idx1[fn]],color="green",linestyle="dotted")
        print(f"SHR={SHR[fn]}, F0={F0[fn]}, N={N}, s={s[fn]}, e={e[fn]}, upperbound = {upperbound}")
        print(f"   (idx1={ifreq[idx1[fn]]}, mag1={mag1[fn]}), (idx2={ifreq[idx2[fn]]}, mag2={mag2[fn]})")

    return DataFrame({'sec': ts, 'f0': F0, 'shr': SHR})