Source code for phonlab.acoustic.cepstral

from ..utils.prep_audio_ import prep_audio
from ..acoustic.sgram_ import compute_sgram
import numpy as np
from librosa import util, stft, amplitude_to_db, frames_to_time
from scipy import fft
from pandas import DataFrame
from scipy.signal import windows,filtfilt
from sklearn.linear_model import LinearRegression
from scipy.ndimage import gaussian_filter


[docs]
def compute_cepstrogram(x,fs, dBscale=True, l= 0.04, s=0.005):
    '''Compute a `cepstrogram` of an audio signal.  Cepstral analysis was introduced by Bogert et al. (1963).  
Sounds (such as voiced vowels) that have a harmonic structure, with peaks of amplitude at integral multiples of
the fundamental frequency will have a strong peak in the cepstrum at the period of the fundamental.  The cepstrum
is computed as the spectrum of the spectrum to discover the harmonic structure of the spectrum.  

Parameters
==========
    x : ndarray
        A one-dimensional array of audio samples
    fs : int
        Sampling rate of **x**
    dBscale : boolean, default = True
        Scale the cepstrum in dB
    l : float, default = 0.04
        Length of analysis windows.  The default is 40 milliseconds.
    s : float, default = 0.005
        Step size, of hops between analysis windows. The default is 5 milliseconds.  

Returns
=======
    quef : ndarray
        A one dimensional numpy array with Quefrency values (in ms)
    sec : ndarray
        A one dimensional numpy array with frame time points (in seconds)
    Sxx : ndarray
        A two dimensional numpy array with cepstral magnitudes at these timepoints and quefrencies.


References
==========
    B. P. Bogert, M. J. R. Healy, and J. W. Tukey, (1963) `The Quefrency Alanysis [sic] of Time Series for Echoes: Cepstrum, Pseudo Autocovariance, Cross-Cepstrum and Saphe Cracking, `Proceedings of the Symposium on Time Series Analysis` (M. Rosenblatt, Ed) Chapter 15, 209-243. New York: Wiley.

    '''
    frame_length = int(l*fs)
    step = int(s*fs)
    half_frame = round(frame_length/2)
    frame_length = half_frame * 2 + 1    # odd number in frame
    NFFT = int(2**(np.ceil(np.log(frame_length)/np.log(2))))
    quef = (np.array(range(NFFT//2))/fs *1000)  # the quefrecy axis of the cepstra (in ms)
    frames = util.frame(x,frame_length=frame_length, hop_length=step,axis=0)

    nb = frames.shape[0]
    f = frames.shape[1]
    w = windows.hann(frame_length)
    
    Sxx = 10 * np.log10(np.abs(fft.rfft(w*frames,NFFT)))
    Sxx2 = np.abs(fft.rfft(Sxx,NFFT))   # spectrum of the spectrum -- cepstrum
    if (dBscale):
        Ceps = 10 * np.log10(Sxx2[:,:-1])
    else:
        Ceps = np.log(Sxx2[:,:-1])
        
    ts = (np.array(range(nb)) * step + half_frame)/fs

    return(quef, ts, Ceps)



[docs]
def CPP(x,fs, target_fs = 16000, smooth=2, norm=True, dBscale=True, f0_range = [60,400], l= 0.04, s=0.005):
    '''Measure Cepstral Peak Prominence - an acoustic measure that has been shown to be highly correlated with 
perceived breathy vocal quality (Hillenbrand & Houde, 1996).  This implementation drew inspiration and ideas from 
John Kane's cpp() matlab function in the `covarep` repository.

Parameters
==========
    x : ndarray
        A one-dimensional array of audio samples
    fs : int
        Sampling rate of **x**
    target_fs : int, default = 16000
        Sampling rate for the analysis algorithm.
    smooth : float, default = 2
        The sigma value of a Gaussian filter that smooths the cepstrogram in both frequency and time
    norm : boolean, default = True
        Flag to request that the cepstral peak prominence be normalized for overall amplitude, using a linear fit 
        to the cepstrum and measuring the height of the peak above the fit line.
    dBscale : boolean, default = True
        Scale the cepstrum in dB
    f0_range : an array of two numbers, default=[50,500]
        The expected range for f0.
    l : float, default = 0.04
        Length of analysis windows.  The default is 40 milliseconds.
    s : float, default = 0.005
        Step size, of hops between analysis windows. The default is 5 milliseconds.  If smoothing==True, the step size
        is reduced to 0.002 (2 millisecond).
    return_Sxx : boolean, default = False
        flag to request to return the cepstrogram (produced by `compute_cepstrogram()`) instead of returning an analysis 
        dataframe.  If smoothing is requested the smoothed cepstrogram is returned.  The return values (instead of the 
        dataframe described below) are then:
        
            * quef - A one dimensional numpy array with Quefrency values (in ms)
            * sec - A one dimensional numpy array with frame time points (in seconds)
            * Sxx - A two dimensional numpy array with cepstral magnitudes at these timepoints and quefrencies.

Returns
=======
    df: pandas DataFrame  
            measurements at 5 msec intervals.

Note
====
The columns in the returned dataframe are for each frame of audio:
    * sec - time at the midpoint of each frame in seconds
    * f0 - estimate of the fundamental frequency in Hz
    * cpp - cepstral peak prominence in dB

Note
====
The default parameter values are an implementation of Hillenbrand and Houde (1996), for the measurement of breathy voice.  
Lower values of cepstral peak prominence are associated with breathy phonation.  Following H&H, the algorithm 
uses smoothing both across time and quefrency to improve the identification of the cepstral peak that corresponds to the 
period of the voicing fundamental.


References
==========
    J. Hillenbrand, R.A. Houde (1996) Acoustic Correlates of Breathy Vocal Quality: Dysphonic Voices and Continuous Speech `Journal of Speech and Hearing Science Research`, 39, 311-321.

    B. P. Bogert, M. J. R. Healy, and J. W. Tukey, (1963) The Quefrency Alanysis [sic] of Time Series for Echoes: Cepstrum, Pseudo Autocovariance, Cross-Cepstrum and Saphe Cracking, `Proceedings of the Symposium on Time Series Analysis` (M. Rosenblatt, Ed) Chapter 15, 209-243. New York: Wiley.

Example
=======
This example plots the cepstral peak prominence through the "I'm twelve" example recording.  Note the increase in breathiness (decrease in cpp in each of the first two words "I'm" and "twelve".

.. code-block:: Python

    example_file = importlib.resources.files('phonlab') / 'data/example_audio/im_twelve.wav'
    x,fs = phon.loadsig(example_file,chansel=[0])

    cppdf   = phon.CPP(x,fs)
    cpp,s,_ = phon.smoothn(cppdf.cpp,s=35,isrobust=True)  # smooth the cpp curve

    ret = phon.sgram(x,fs,cmap="Blues") # draw the spectrogram from the array of samples
    ax2 = ret[0].twinx()
    ax2.plot(cppdf.sec, cpp, 'r-')
    plt.ylabel("Cepstral Peak Prominence (dB)")

.. figure:: images/cpp.png
    :scale: 40 %
    :alt: Spectrogram, with cepstral peak prominence curve overlaid.
    :align: center  

    '''
    y,fs = prep_audio(x, fs, target_fs=target_fs, pre=0.0, quiet=True)  # resample to 16kHz

    if smooth: s = 0.002  # faster framerate if smoothed

    sec, quef, Sxx = compute_cepstrogram(y, fs, dBscale, l, s)
    
    if smooth:
        Sxx = gaussian_filter(Sxx,sigma = smooth,truncate=3)
        
    Sxx = np.nan_to_num(Sxx) # replaces NaN with 0

    sT = int(np.round(fs/f0_range[1]))  # the shortest expected pitch period
    lT = int(np.round(fs/f0_range[0])) # the longest expected pitch period
    
    cp = np.argmax(Sxx[:,sT:lT],axis=-1) + sT
    cpp = np.max(Sxx[:,sT:lT],axis=-1)
    f0 = 1/(cp/fs)

    if norm:
        # hard coding here the range for the linear regression CPP normalization
        sT = int(np.round(fs/500)) # was [300,60] in Hillenbrand & Houde
        lT = int(np.round(fs/50)) # was [300,60] in Hillenbrand & Houde
        X = np.array(np.arange(sT,lT,1))  # line fitting in the f0 region
        X = np.reshape(X,(len(X),1))
        reg =LinearRegression().fit(X,y=Sxx[:,sT:lT].T)  # vectorized regressions
        p = np.diag(reg.predict(np.reshape(cp,(-1,1))))
        cpp = cpp-p
            
    return DataFrame({'sec': sec, 'f0':f0, 'cpp':cpp})




[docs]
def cepstral_smooth(x, fs, tf=5000, s=0.005, l=0.04, preemphasis=0.0, lift=0.003):
    '''Smooth spectra by cepstral 'liftering' to remove the periodic source, leaving only the overall spectral envelope shape. 
The implementation here borrows heavily from Seonju Kim's repository (https://github.com/hwang9u/pyceps).  

Parameters
==========
    x : ndarray
        A one-dimensional array of audio samples
    fs : int
        Sampling rate of **x**
    tf : int, default = 5000
        The top frequency (in Hz) of the smoothed spectrum.  The audio will be resampled to the sampling rate `tf` * 2.
    s : float, default = 0.005
        Step size, of hops between analysis windows. The default is 5 milliseconds.  
    l : float, default = 0.04
        Length of analysis windows.  The default is 40 milliseconds.  The actual size of the analysis windows will 
        be the smallest power of 2 that is greater than or equal to this length in samples.
    lift : float, default = 0.003
        The filtering coefficient (in seconds).  Compenents that have a period longer than the `lift` value will be 
        removed from the signal.  The default value of 3 ms means that cepstral components lower than 333Hz will be
        removed.  Smaller values of this parameter lead to more smoothing, and larger values lead to less spectral 
        smoothing.

Returns
=======
    sec : ndarray
        A one dimensional numpy array with frame time points (in seconds)
    freqE : ndarray
        A one dimensional numpy array with frequency values (in Hz) for the smoothed spectral envelope, Exx.
    Exx : ndarray
        A two dimensional numpy array with cepstrally smoothed spectral magnitudes at these timepoints and frequencies.  The spectrum at time point `i` is found at Exx[:,i].
    freqS : ndarray
        The frequency axis for the raw spectral array, Sxx.
    Sxx : nd array
        A two dimensional numpy array with raw spectral magnitudes, before smoothing.  A narrow band spectrogram.
    quef : nd array
        A one dimensional numpy array with quefrency values (in ms) corresponding to axis 1 of the cepstra array, Cxx.
    Cxx : nd array
        A two dimensional numpy array of cepstra at the time points in sec.


References
==========
    B. P. Bogert, M. J. R. Healy, and J. W. Tukey, (1963) `The Quefrency Alanysis [sic] of Time Series for Echoes: Cepstrum, Pseudo Autocovariance, Cross-Cepstrum and Saphe Cracking, `Proceedings of the Symposium on Time Series Analysis` (M. Rosenblatt, Ed) Chapter 15, 209-243. New York: Wiley.

.. code-block:: Python

    import matplotlib.pyplot as plt

    audio_dir = importlib.resources.files('phonlab') / 'data' / 'example_audio'
    example_file = audio_dir / 'sf3_cln.wav'
        
    x,fs = phon.loadsig(example_file,chansel=[0])

    freq, ts, Exx, Sxx, quef, Cxx = phon.cepstral_smooth(x, fs, lift=0.0022)

    # ------ plot spectra at a particular time point -------

    slice_time = 1.4
    i = np.argmin(np.abs(ts-slice_time))  # find the index of the spectral slice

    plt.plot(freq,Exx[:,i], color='blue')
    plt.plot(freq1,Sxx[:,i],alpha=0.2,color='gray')


.. figure:: images/cepstral_smooth.png
    :scale: 50 %
    :alt: A ceptrally smoothed spectrum taken at time 1.4 in file sf3_cln.wav
    :align: center

    A ceptrally smoothed spectrum taken at time 1.4 in file sf3_cln.wav, in comparison with the unsmoothed FFT spectrum.

    '''
    target_fs = int(tf*2)
    
    x,fs = prep_audio(x, fs, target_fs=target_fs, pre=preemphasis, quiet=False)  # resample to target_fs

    step = int(fs*s)
    frame_length = int(fs*l)
    half_frame = frame_length//2

    ts,freqS,Sxx = compute_sgram(x,fs,l,s)
                              
    Cxx = np.apply_along_axis(func1d=lambda xx: np.fft.irfft(xx).real, axis=0, arr=Sxx)   # cepstrogram
    quef = np.arange(Cxx.shape[0])/fs

    lift = int(lift * fs)  # convert lift value to samples (from seconds)
    Exx = Cxx.copy()
    Exx = np.concatenate(  (Exx, Exx[1:][::-1]), axis=0 )   # mirror spectrum
    Exx[lift:-lift, :] = 0.     # filter, set all to zero exept below lift period (shorter periods only)
    Exx = np.apply_along_axis(func1d=lambda xx: np.fft.rfft(xx).real[:Cxx.shape[0]], 
                              axis=0, arr=Exx)  # back to frequency axis
    freqE = np.arange(1,Exx.shape[0]+1)*(fs/2)/Exx.shape[0]                       # frequency axis

    return ts, freqE, Exx, freqS, Sxx, quef, Cxx