import librosa
import numpy as np
from numpy.polynomial import Polynomial
from scipy.fft import rfft
import scipy.stats as stats
from ..utils.prep_audio_ import prep_audio
[docs]
class Audspec(object):
""" Create an an Audspec object; analysis parameters and routines for creating auditory spectrograms.
Parameters
==========
fs : int, default=16000
The desired sampling rate of audio analysis, determines the frequency range of the auditory spectrogram. Note that if the value given here exceeds the sampling rate of the file passed, there can be 'empty' space at the top of the auditory spectrogram.
step_size : float, default=0.03
The interval (in seconds) between successive analysis frames.
Returns
=======
object: Audspec
The object returned by the constructor function is ready for calls to functions like object.make_zgram(), object.make_sharpgram(), etc. to compute auditory representations of sound.
Examples
========
The examples here show the use of the `Audspec` class to create auditory representations of sounds, roughly based on the properties of the human cochlea and aspects of auditory processing in the brainstem. The most basic representation is the auditory spectrogram (internally called the `zgram` and sometimes referred to as the `audiogram`), which is simply a critical band filtered spectrogram.
.. code-block:: Python
x,fs = phon.loadsig("sf3_cln.wav",chansel=[0])
aud = phon.Audspec()
aud.make_zgram(x,fs)
# ---- the rest is to make a nice plot ----
fig,ax = plt.subplots(2,figsize=(6,5))
Hz_extent = (min(aud.time_axis), max(aud.time_axis),
min(aud.fft_freqs), max(aud.fft_freqs)) # time and frequency values for sgram.
ax[0].imshow(20*np.log10(aud.sgram.T),origin='lower', aspect='auto', interpolotion="spline36",
extent=Hz_extent, cmap = plt.cm.Greys)
ax[0].set(xlabel="Time (sec)", ylabel="Frequency (Hz)")
ax[1].imshow(aud.zgram.T,origin='lower', aspect='auto', interpolotion="spline36",
extent=aud.extent, cmap = plt.cm.Purples)
ax[1].set(xlabel="Time (sec)", ylabel="Frequency (Bark)")
fig.tight_layout()
.. figure:: images/make_zgram.png
:scale: 50 %
:alt: An acoustic narrow band spectrogram and the auditory spectrogram of the same utterance.
:align: center
(top) Acoustic narrow band spectrogram, and (bottom) Auditory spectrogram of the same utterance.
One type of secondary auditory processing sharpens the frequencies of the audiogram, simulating frequency selectivity through lateral inhibition.
.. code-block:: Python
aud.make_sharpgram()
fig,ax = plt.subplots(1,figsize=(8,3))
ax.imshow(aud.sharpgram.T,origin='lower', aspect='auto',
extent=aud.extent, interpolation="spline36", cmap = plt.cm.Reds)
ax.set(xlabel="Time (sec)", ylabel="Frequency (Bark)")
.. figure:: images/make_sharpgram.png
:scale: 50 %
:alt: The "sharpgram" auditory representation
:align: center
(top) The "sharpgram" auditory representation.
Another auditory representation emphasizes temporal changes in the audiogram. In the `Audspec` class this is referred to as a `tgram`.
.. code-block:: Python
aud.make_tgram()
fig,ax = plt.subplots(1,figsize=(8,3))
ax.imshow(aud.tgram.T,origin='lower', aspect='auto',
extent=aud.extent, interpolation="spline36", cmap = plt.cm.afmhot)
ax.set(xlabel="Time (sec)", ylabel="Frequency (Bark)")
.. figure:: images/make_tgram.png
:scale: 50 %
:alt: The "tgram" auditory representation, showing the gradient of change in critical bands
:align: center
(top) The "tgram" auditory representation, showing the gradient of change in critical bands.
Methods and Attributes in an Audspec object
===========================================
"""
def hz2bark(self, hz):
return 7 * np.arcsinh(hz/650)
def bark2hz(self, bark):
return 650 * np.sinh(bark/7)
def _make_cb_filters(self):
"""
Create and return 2d array of Patterson filters for DFT spectra based
on attribute values in `self`.
The returned filters are stored in an 2d array in which the
rows represent the filter frequency bands in ascending order. The
columns contain symmetrical filter coefficients as determined by the
Patterson (1976) formula and centered at the filter frequency in the
DFT spectrum. Filter coefficients outside the frequency band are set
to 0.0.
The one-sided length of filter coefficients for each band is stored
in the `cbfiltn` attribute. The number of coefficients in the
symmetrical filter for band `j` is therefore
`(self.cbfiltn[j] * 2) - 1`. In a few bands this calculation might not
be correct since the coefficients may not fit when the center frequency
is near the left or right edge of the DFT spectrum. In such cases the
coefficients are truncated, and the actual number of coefficients for
the band `j` can be found with `np.sum(self.cbfilts[j] != 0.0)`.
References
----------
R. D. Patterson (1976) Auditory filter shapes derived with noise stimuli. `JASA` **59** , 640-54.
"""
cbfilts = np.zeros([len(self.freqs), len(self.sgram)])
dfreq = np.arange(self.maxcbfiltn) * self.inc
cbfiltn = np.searchsorted(dfreq, self.freqs / 5)
cbfiltn[cbfiltn > self.maxcbfiltn] = self.maxcbfiltn
self.cbfiltn = cbfiltn
for j, iidx in enumerate(cbfiltn):
cbfilt = np.zeros(self.maxcbfiltn)
bw = 10.0 ** ( (8.3 * np.log10(self.freqs[j]) - 2.3) / 10.0 )
hsq = np.exp(-np.pi * ((dfreq[:iidx] / bw) ** 2))
cbfilt[:iidx] = np.sqrt(hsq)
cbfilt /= cbfilt[0] + np.sum(cbfilt[1:] * 2)
# Make a symmetrical array of coefficients centered at loc.
# [n, n-1, ..., 2, 1, 0, 1, 2, ... n-1, n]
loc = (self.freqs[j] / self.inc).astype(int) # get location in dft spectrum
left_n = iidx if iidx <= loc else loc
right_n = iidx if loc + iidx < (self.dft_n / 2) else int(self.dft_n / 2) - loc
coef = np.append(np.flip(cbfilt[:left_n])[:-1], cbfilt[:right_n])
startidx = loc - left_n + 1
endidx = loc + right_n
cbfilts[j, startidx:endidx] = coef
return cbfilts
def _make_sgram(self, data, *args, **kwargs):
'''
Private function to make an acoustic spectrogram via rfft().
And add equal loudness contour.
Parameters
----------
data: 1d array
Audio data.
kwargs: dict, optional
Keyword arguments will be passed to the scipy.fft.rfft() function.
Returns
-------
A tuple, consisting of:
sgram: 2D array
The acoustic spectrogram of shape (times, frequency bins).
spect_times: 1D array
The times of each spectral slice in `spect`.
'''
data = np.pad(data, [int(self.dft_n/2), int(self.dft_n/2)], 'constant')
if (np.max(data) < 1): # floating point but we need 16bit int range
data = (data*(2**15)) #.astype(np.int32)
hop = int(self.step_size * self.fs)
frames = librosa.util.frame(data,frame_length=self.dft_n,hop_length=hop
).transpose()
t = librosa.frames_to_time(np.arange(frames.shape[0]),
sr=self.fs,hop_length=hop,n_fft=self.dft_n)
# Add some noise, then scale frames by the window.
frames = (frames + np.random.normal(0, 1, frames.shape)) * np.hamming(self.dft_n)
A = 2/self.dft_n * rfft(frames, **kwargs)
sgram = (np.abs(A)).astype(self.sgram.dtype)
return (sgram, t)
[docs]
def make_zgram(self, x, fs, target_fs=16000, preemph = 0, **kwargs):
'''Make an auditory spectrogram by creating an acoustic spectrogram and then applying critical-band filters to it, using the filter shapes described by Patterson (1976). The function creates the auditory spectrogram and stores it in `self.zgram`.
Parameters
----------
x : ndarray
Audio data as a one dimensional numpy array
fs : int, default = 16000
the sampling rate of the audio in **x**.
preemp : float, default = 1.0
The amount of preemphasis to apply before filtering.
**kwargs: dict, optional
Keyword arguments to be passed to scipy.fft.rfft()
References
----------
R. D. Patterson (1976) Auditory filter shapes derived with noise stimuli. `J. Acoust. Soc. Am.` **59** , 640-54.
'''
x, fs = prep_audio(x, fs = fs, target_fs = self.fs, pre = preemph)
(sgram, self.time_axis) = self._make_sgram(x, kwargs)
self.sgram = sgram + self.loud
zgram = self.sgram[:, np.newaxis, :] * self.cbfilts[np.newaxis, :, :]
self.zgram = 20 * np.log10(zgram.sum(axis=2)+1)
self.extent = (min(self.time_axis),max(self.time_axis),min(self.zfreqs),max(self.zfreqs))
def create_sharp_filter(self, span=4, mult=3, dimension="frequency"):
if (dimension=="frequency"): # default value
steps = int(span / self._zinc)
else: # otherwise assume temporal sharpening
steps = int(span / self.step_size)
if steps % 2 == 0:
steps += 1
sharp = np.full(steps, -1.0)
mid = int(steps / 2)
sharp[mid] = steps * mult
sharp /= sharp.sum()
return sharp
def create_blur_filter(self, span=3, sigma=1.5):
steps = int(span / self._zinc)
if steps % 2 == 0:
steps += 1
mid = int(steps / 2)
blur = 1 / (np.sqrt(np.pi*2) * sigma) * \
np.exp(((np.arange(-mid, mid+1) ** 2) * -1) / (2 * sigma**2))
blur /= blur.sum()
return blur
def apply_filt(self, gram, filt, axis=0, half_rectify=True):
# Make the axis to act on the first dimension if required.
if axis == 1:
gram = gram.transpose()
# Do convolution along the first dimension.
agram = np.zeros_like(gram)
mid = (len(filt) - 1) / 2
for j in np.arange(gram.shape[0]):
agram[j] = np.convolve(
np.pad(gram[j], int(mid), mode='edge'),
filt,
mode='valid'
)
# Do half-wave rectification if requested.
if half_rectify is True:
agram[agram < 0] = 0
# Rescale spectrogram values as relative magnitude.
agram = (agram - np.min(agram)) / (np.max(agram) - np.min(agram))
if axis == 1:
return agram.transpose()
else:
return agram
[docs]
def make_sharpgram(self,span=6, mult=1, dimension = "frequency"):
'''Sharpens the frequency distinctions or temporal dimension in the auditory spectrogram and stores the resulting sharpened spectrogram in the class property `self.sharpgram`. Note that `make_zgram()` must be called before calling this function.
Parameters
----------
span : scalar, default = 6
The time (in seconds) or frequency (in Bark) range of the filter
mult : scalar, default = 1
The degree of sharpening, larger value gives more contrast
dimension : string, default = "frequency"
For sharpening in the "frequency" domain or the "time" domain.
'''
if len(self.zgram.shape)==1:
print("Call make_zgram() before calling make_sharpgram()")
return
sharpen = self.create_sharp_filter(span, mult,dimension)
self.sharpgram = self.apply_filt(self.zgram, sharpen, axis=0, half_rectify=True) # frequency sharpening
[docs]
def make_blurgram(self,span=3, sigma=1.5):
'''Blur the frequency contrasts in the auditory spectrogram using a 1d Gaussian blur filter. The resulting blurred auditory spectrogram is stored in the class property `self.blurgram`. Note that `make_zgram()` must be called before calling this function.
Parameters
----------
span: scalar, default = 3
Frequency range, in Bark, over which the filter blurs
sigma: scalar, default = 1.5
The variance of the Gaussian function
'''
if len(self.zgram.shape)==1:
print("Call make_zgram() before calling make_blurgram()")
return
blur = self.create_blur_filter(span, sigma)
self.blurgram = self.apply_filt(self.zgram, blur, axis=0, half_rectify=True) # frequency blurring
[docs]
def make_tgram(self):
'''Compute the change in energy in each critical band in the auditory spectrogram. The tgram is positive when the amplitude is increasing, and negative when the amplitude in a critical band is decreasing. The resulting temporal contrast auditory spectrogram is stored in the class property `self.tgram`. Note that `make_zgram()` must be called before calling this function.
'''
if len(self.zgram.shape)==1:
print("Call make_zgram() before calling make_tgram()")
return
self.tgram = stats.zscore(np.gradient(self.zgram,axis=0),axis=0) # temporal change
[docs]
def savez(self, fname, **kwargs):
'''Calls numpy.savez to save all of the properties of the audspec object.
Parameters
----------
fname : string
Name of the file in which to save the data. Should end in ".npz"
'''
np.savez(
fname,
**self.__dict__,
**kwargs,
**{'custom_vars': list(kwargs.keys())}
)
def __init__(self, fs=16000, step_size=0.03):
float_t=np.float32
int_t=np.int32
super(Audspec, self).__init__()
self.fs = fs
self.dft_n = 2**(int_t(np.log2(0.05*fs))) # choose fft size based on fs
spect_points = int_t(self.dft_n/2) + 1
self._topbark = self.hz2bark(self.fs/2.0)
self.ncoef = int_t(self._topbark * 3.5) # number of points in the auditory spectrum
self._zinc = self._topbark/(self.ncoef+1)
self.inc = self.fs/self.dft_n; # get hz stepsize in fft
self.fft_freqs = np.arange(1, spect_points+1) * self.inc
self.sgram = np.zeros(spect_points, dtype=float_t) #: ndarray - 2d acoustic narrow band spectrogram
self.zgram = np.zeros(self.ncoef, dtype=float_t) #: ndarray - 2d auditory spectrogram
self.sharpgram = np.zeros(self.ncoef,dtype=float_t) #: ndarray - frequency sharpened auditory spectrogram
self.blurgram = np.zeros(self.ncoef,dtype=float_t) #: ndarray - blurred auditory spectrogram
self.tgram = np.zeros(self.ncoef,dtype=float_t) #: ndarray - temporal contrast auditory spectrogram
self.zfreqs = np.arange(1, self.ncoef+1) * self._zinc #: ndarray - Center frequencies of the critical bands in Bark
self.freqs = self.bark2hz(self.zfreqs) #: ndarray - Center frequencies of the critical bands in Hz
self.step_size = float_t(step_size) #: temporal interval between frames in sec
self.time_axis = np.zeros(0, dtype=float_t) #: ndarray - time axis for auditory spectrogram
self.extent = [0,0,0,0] #: ndarray - [xmin,xmax,ymin,ymax] plotting limits of the auditory spectrogram for imshow()
self.maxcbfiltn = int_t(100) # number of points in biggest CB filter
self.cbfilts = self._make_cb_filters().astype(float_t)
loudpoly = Polynomial([22.57, -11.46, -52.58, 226.97, 41.05, -1415.86,
925.53, 5216.88, -5157.93, -10245.93, 11386.57,
9702.65, -11213.73, -3484.65, 4079.037], domain=[20,20000])
self.loud = (10.0**(-loudpoly(self.fft_freqs)/10.0)).astype(float_t)