Source code for phonlab.auditory.audspec

import librosa
import numpy as np
from numpy.polynomial import Polynomial
from scipy.fft import rfft
import scipy.stats as stats
from ..utils.prep_audio_ import prep_audio


[docs]
class Audspec(object):
    """ Create an an Audspec object; analysis parameters and routines for creating auditory spectrograms.

    Parameters
    ==========
    fs : int, default=16000
        The desired sampling rate of audio analysis, determines the frequency range of the auditory spectrogram.  Note that if the value given here exceeds the sampling rate of the file passed, there can be 'empty' space at the top of the auditory spectrogram.
    step_size : float, default=0.03
        The interval (in seconds) between successive analysis frames.

    Returns
    =======
    object: Audspec
        The object returned by the constructor function is ready for calls to functions like object.make_zgram(), object.make_sharpgram(), etc. to compute auditory representations of sound.

    Examples
    ========
    The examples here show the use of the `Audspec` class to create auditory representations of sounds, roughly based on the properties of the human cochlea and aspects of auditory processing in the brainstem.  The most basic representation is the auditory spectrogram (internally called the `zgram` and sometimes referred to as the `audiogram`), which is simply a critical band filtered spectrogram.

    .. code-block:: Python

         x,fs = phon.loadsig("sf3_cln.wav",chansel=[0])
         aud = phon.Audspec()
         aud.make_zgram(x,fs)
        
         # ---- the rest is to make a nice plot ----
         
         fig,ax = plt.subplots(2,figsize=(6,5))
         
         Hz_extent = (min(aud.time_axis), max(aud.time_axis),
               min(aud.fft_freqs), max(aud.fft_freqs))  # time and frequency values for sgram.
         ax[0].imshow(20*np.log10(aud.sgram.T),origin='lower', aspect='auto', interpolotion="spline36",
                  extent=Hz_extent, cmap = plt.cm.Greys)
         ax[0].set(xlabel="Time (sec)", ylabel="Frequency (Hz)")
         ax[1].imshow(aud.zgram.T,origin='lower', aspect='auto', interpolotion="spline36",
                  extent=aud.extent, cmap = plt.cm.Purples)
         ax[1].set(xlabel="Time (sec)", ylabel="Frequency (Bark)")
         fig.tight_layout()

    .. figure:: images/make_zgram.png
       :scale: 50 %
       :alt: An acoustic narrow band spectrogram and the auditory spectrogram of the same utterance.
       :align: center

       (top) Acoustic narrow band spectrogram, and (bottom) Auditory spectrogram of the same utterance.

    One type of secondary auditory processing sharpens the frequencies of the audiogram, simulating frequency selectivity through lateral inhibition.
    
    .. code-block:: Python

        aud.make_sharpgram()
        fig,ax = plt.subplots(1,figsize=(8,3))

        ax.imshow(aud.sharpgram.T,origin='lower', aspect='auto',
                 extent=aud.extent, interpolation="spline36", cmap = plt.cm.Reds)
        ax.set(xlabel="Time (sec)", ylabel="Frequency (Bark)")

    .. figure:: images/make_sharpgram.png
       :scale: 50 %
       :alt: The "sharpgram" auditory representation
       :align: center

       (top) The "sharpgram" auditory representation.

    Another auditory representation emphasizes temporal changes in the audiogram.  In the `Audspec` class this is referred to as a `tgram`.

    .. code-block:: Python

        aud.make_tgram()
        fig,ax = plt.subplots(1,figsize=(8,3))

        ax.imshow(aud.tgram.T,origin='lower', aspect='auto',
                 extent=aud.extent, interpolation="spline36", cmap = plt.cm.afmhot)
        ax.set(xlabel="Time (sec)", ylabel="Frequency (Bark)")

    .. figure:: images/make_tgram.png
       :scale: 50 %
       :alt: The "tgram" auditory representation, showing the gradient of change in critical bands
       :align: center

       (top) The "tgram" auditory representation, showing the gradient of change in critical bands.


    Methods and Attributes in an Audspec object
    ===========================================
    
    """

        
    def hz2bark(self, hz):
        return 7 * np.arcsinh(hz/650)

    def bark2hz(self, bark):
        return 650 * np.sinh(bark/7)

    def _make_cb_filters(self):
        """        
        Create and return 2d array of Patterson filters for DFT spectra based
        on attribute values in `self`.

        The returned filters are stored in an 2d array in which the
        rows represent the filter frequency bands in ascending order. The
        columns contain symmetrical filter coefficients as determined by the
        Patterson (1976) formula and centered at the filter frequency in the
        DFT spectrum. Filter coefficients outside the frequency band are set
        to 0.0.

        The one-sided length of filter coefficients for each band is stored
        in the `cbfiltn` attribute. The number of coefficients in the
        symmetrical filter for band `j` is therefore
        `(self.cbfiltn[j] * 2) - 1`. In a few bands this calculation might not
        be correct since the coefficients may not fit when the center frequency
        is near the left or right edge of the DFT spectrum. In such cases the
        coefficients are truncated, and the actual number of coefficients for
        the band `j` can be found with `np.sum(self.cbfilts[j] != 0.0)`.

        References
        ----------
        R. D. Patterson (1976) Auditory filter shapes derived with noise stimuli. `JASA` **59** , 640-54.
        """
        
        cbfilts = np.zeros([len(self.freqs), len(self.sgram)])
        dfreq = np.arange(self.maxcbfiltn) * self.inc
        cbfiltn = np.searchsorted(dfreq, self.freqs / 5)
        cbfiltn[cbfiltn > self.maxcbfiltn] = self.maxcbfiltn
        self.cbfiltn = cbfiltn
        for j, iidx in enumerate(cbfiltn):
            cbfilt = np.zeros(self.maxcbfiltn)
            bw = 10.0 ** ( (8.3 * np.log10(self.freqs[j]) - 2.3) / 10.0 )
            hsq = np.exp(-np.pi * ((dfreq[:iidx] / bw) ** 2))
            cbfilt[:iidx] = np.sqrt(hsq)
            cbfilt /= cbfilt[0] + np.sum(cbfilt[1:] * 2)

            # Make a symmetrical array of coefficients centered at loc.
            # [n, n-1, ..., 2, 1, 0, 1, 2, ... n-1, n]
            loc = (self.freqs[j] / self.inc).astype(int) # get location in dft spectrum
            left_n = iidx if iidx <= loc else loc
            right_n = iidx if loc + iidx < (self.dft_n / 2) else int(self.dft_n / 2) - loc
            coef = np.append(np.flip(cbfilt[:left_n])[:-1], cbfilt[:right_n])
            startidx = loc - left_n + 1
            endidx = loc + right_n
            cbfilts[j, startidx:endidx] = coef
        return cbfilts

    def _make_sgram(self, data, *args, **kwargs):
        '''
        Private function to make an acoustic spectrogram via rfft().
        And add equal loudness contour.

        Parameters
        ----------

        data: 1d array
        Audio data.

        kwargs: dict, optional
        Keyword arguments will be passed to the scipy.fft.rfft() function.

        Returns
        -------
        A tuple, consisting of:

        sgram: 2D array
            The acoustic spectrogram of shape (times, frequency bins).

        spect_times: 1D array
            The times of each spectral slice in `spect`.
        '''
        data = np.pad(data, [int(self.dft_n/2), int(self.dft_n/2)], 'constant')
        if (np.max(data) < 1):   # floating point but we need 16bit int range
            data = (data*(2**15)) #.astype(np.int32)

        hop = int(self.step_size * self.fs)
        frames = librosa.util.frame(data,frame_length=self.dft_n,hop_length=hop
            ).transpose()
        t = librosa.frames_to_time(np.arange(frames.shape[0]),
            sr=self.fs,hop_length=hop,n_fft=self.dft_n)
        # Add some noise, then scale frames by the window.
        frames = (frames + np.random.normal(0, 1, frames.shape)) * np.hamming(self.dft_n)
        A = 2/self.dft_n * rfft(frames, **kwargs)
        sgram = (np.abs(A)).astype(self.sgram.dtype)
        return (sgram, t)




[docs]
    def make_zgram(self, x, fs, target_fs=16000, preemph = 0, **kwargs):
        '''Make an auditory spectrogram by creating an acoustic spectrogram and then applying critical-band filters to it, using the filter shapes described by Patterson (1976).  The function creates the auditory spectrogram and stores it in `self.zgram`.

Parameters
----------

x : ndarray
    Audio data as a one dimensional numpy array 

fs : int, default = 16000
    the sampling rate of the audio in **x**.

preemp : float, default = 1.0
    The amount of preemphasis to apply before filtering.

**kwargs: dict, optional
    Keyword arguments to be passed to scipy.fft.rfft() 


References
----------

R. D. Patterson (1976) Auditory filter shapes derived with noise stimuli. `J. Acoust. Soc. Am.` **59** , 640-54.
        '''
        
        x, fs = prep_audio(x, fs = fs, target_fs = self.fs, pre = preemph)
        (sgram, self.time_axis) = self._make_sgram(x, kwargs)
        self.sgram = sgram + self.loud
        zgram = self.sgram[:, np.newaxis, :] * self.cbfilts[np.newaxis, :, :]
        self.zgram = 20 * np.log10(zgram.sum(axis=2)+1)
        self.extent = (min(self.time_axis),max(self.time_axis),min(self.zfreqs),max(self.zfreqs)) 

        
    def create_sharp_filter(self, span=4, mult=3, dimension="frequency"):
        if (dimension=="frequency"):  # default value
            steps = int(span / self._zinc)
        else:  # otherwise assume temporal sharpening
            steps = int(span / self.step_size)
            
        if steps % 2 == 0:
            steps += 1
        sharp = np.full(steps, -1.0)
        mid = int(steps / 2)
        sharp[mid] = steps * mult
        sharp /= sharp.sum()
        return sharp 
     

    def create_blur_filter(self, span=3, sigma=1.5):
        steps = int(span / self._zinc)
        if steps % 2 == 0:
            steps += 1
        mid = int(steps / 2)
        blur = 1 / (np.sqrt(np.pi*2) * sigma) * \
            np.exp(((np.arange(-mid, mid+1) ** 2) * -1) / (2 * sigma**2))
        blur /= blur.sum()
        return blur

    def apply_filt(self, gram, filt, axis=0, half_rectify=True):
        # Make the axis to act on the first dimension if required.
        if axis == 1:
            gram = gram.transpose()

        # Do convolution along the first dimension.
        agram = np.zeros_like(gram)
        mid = (len(filt) - 1) / 2
        for j in np.arange(gram.shape[0]):
            agram[j] = np.convolve(
                np.pad(gram[j], int(mid), mode='edge'),
                filt,
                mode='valid'
            )

        # Do half-wave rectification if requested.
        if half_rectify is True:
            agram[agram < 0] = 0

        # Rescale spectrogram values as relative magnitude.
        agram = (agram - np.min(agram)) / (np.max(agram) - np.min(agram))

        if axis == 1:
            return agram.transpose()
        else:
            return agram


[docs]
    def make_sharpgram(self,span=6, mult=1, dimension = "frequency"):
        '''Sharpens the frequency distinctions or temporal dimension in the auditory spectrogram and stores the resulting sharpened spectrogram in the class property `self.sharpgram`. Note that `make_zgram()` must be called before calling this function.

        Parameters
        ----------

        span : scalar, default = 6
            The time (in seconds) or frequency (in Bark) range of the filter

        mult : scalar, default = 1
            The degree of sharpening, larger value gives more contrast

        dimension : string, default = "frequency"
            For sharpening in the "frequency" domain or the "time" domain.


        '''
        if len(self.zgram.shape)==1:
            print("Call make_zgram() before calling make_sharpgram()")
            return
            
        sharpen = self.create_sharp_filter(span, mult,dimension)
        self.sharpgram = self.apply_filt(self.zgram, sharpen, axis=0, half_rectify=True)  # frequency sharpening



[docs]
    def make_blurgram(self,span=3, sigma=1.5):
        '''Blur the frequency contrasts in the auditory spectrogram using a 1d Gaussian blur filter.  The resulting blurred auditory spectrogram is stored in the class property `self.blurgram`. Note that `make_zgram()` must be called before calling this function.

        Parameters
        ----------

        span: scalar, default = 3
            Frequency range, in Bark, over which the filter blurs

        sigma: scalar, default = 1.5
            The variance of the Gaussian function

        '''
        
        if len(self.zgram.shape)==1:
            print("Call make_zgram() before calling make_blurgram()")
            return
            
        blur = self.create_blur_filter(span, sigma)
        self.blurgram = self.apply_filt(self.zgram, blur, axis=0, half_rectify=True)  # frequency blurring



[docs]
    def make_tgram(self):
        '''Compute the change in energy in each critical band in the auditory spectrogram.  The tgram is positive when the amplitude is increasing, and negative when the amplitude in a critical band is decreasing.  The resulting temporal contrast auditory spectrogram is stored in the class property `self.tgram`.  Note that `make_zgram()` must be called before calling this function.

        '''

        if len(self.zgram.shape)==1:
            print("Call make_zgram() before calling make_tgram()")
            return
        self.tgram = stats.zscore(np.gradient(self.zgram,axis=0),axis=0)  # temporal change




[docs]
    def savez(self, fname, **kwargs):
        '''Calls numpy.savez to save all of the properties of the audspec object.

        Parameters
        ----------

        fname : string
            Name of the file in which to save the data.  Should end in ".npz"

        '''
        
        np.savez(
            fname,
            **self.__dict__,
            **kwargs,
            **{'custom_vars': list(kwargs.keys())}
        )

    
    def __init__(self, fs=16000, step_size=0.03):
        float_t=np.float32
        int_t=np.int32
        super(Audspec, self).__init__()
        self.fs = fs 
        self.dft_n = 2**(int_t(np.log2(0.05*fs)))  # choose fft size based on fs
        spect_points = int_t(self.dft_n/2) + 1
        self._topbark = self.hz2bark(self.fs/2.0)
        self.ncoef = int_t(self._topbark * 3.5)  # number of points in the auditory spectrum
        self._zinc = self._topbark/(self.ncoef+1)
        self.inc = self.fs/self.dft_n;   # get hz stepsize in fft
        self.fft_freqs = np.arange(1, spect_points+1) * self.inc

        self.sgram = np.zeros(spect_points, dtype=float_t) #: ndarray - 2d acoustic narrow band spectrogram
        self.zgram = np.zeros(self.ncoef, dtype=float_t) #: ndarray - 2d auditory spectrogram
        self.sharpgram = np.zeros(self.ncoef,dtype=float_t)  #: ndarray - frequency sharpened auditory spectrogram
        self.blurgram = np.zeros(self.ncoef,dtype=float_t) #: ndarray - blurred auditory spectrogram
        self.tgram = np.zeros(self.ncoef,dtype=float_t) #: ndarray - temporal contrast auditory spectrogram
        
        self.zfreqs = np.arange(1, self.ncoef+1) * self._zinc #: ndarray - Center frequencies of the critical bands in Bark
        self.freqs = self.bark2hz(self.zfreqs) #: ndarray - Center frequencies of the critical bands in Hz
        self.step_size = float_t(step_size)  #: temporal interval between frames in sec
        self.time_axis = np.zeros(0, dtype=float_t)  #: ndarray - time axis for auditory spectrogram
        self.extent = [0,0,0,0] #: ndarray - [xmin,xmax,ymin,ymax] plotting limits of the auditory spectrogram for imshow()

        
        self.maxcbfiltn = int_t(100)  # number of points in biggest CB filter
        self.cbfilts = self._make_cb_filters().astype(float_t)
        loudpoly = Polynomial([22.57, -11.46, -52.58, 226.97, 41.05, -1415.86, 
                 925.53, 5216.88, -5157.93, -10245.93, 11386.57, 
                 9702.65, -11213.73, -3484.65, 4079.037], domain=[20,20000])
        self.loud = (10.0**(-loudpoly(self.fft_freqs)/10.0)).astype(float_t)