# Source code for phonlab.utils.prep_audio_

# Public API of this module: the only name exported by a star-import.
__all__ = ["prep_audio"]

import numpy as np
from scipy.signal import resample_poly


def prep_audio(x, fs, target_fs=32000, pre=0, scale=True, add_tiny_noise=True,
               outtype="float", pad_to=0.0, quiet=False):
    """ Prepare an array of audio waveform samples for acoustic analysis.

    The processing steps are applied in this order: optional resampling,
    polarity normalization, optional pre-emphasis, optional peak scaling,
    optional addition of low-level noise, optional padding, and optional
    conversion to 16 bit integers.

    Parameters
    ==========
        x : array
            a one-dimensional numpy array with audio samples in it.
            Non-floating-point input (for example 16 bit integers) is
            converted to floating point before processing so that the
            arithmetic below cannot overflow.
        fs : int
            The sampling rate of the sound in **x**.
        target_fs : int, default=32000
            The desired sampling rate of the audio samples that will be
            returned by the function.  Set **target_fs = None** if you don't
            want to change the sampling rate.
        pre : float, default = 0
            how much high frequency preemphasis to apply (between 0 and 1).
        scale : boolean, default = True
            scale the samples to use the full range for audio samples (based
            on the peak absolute amplitude in the signal).  The peak is
            scaled to 0.9.  An all-zero signal is left unscaled.
        add_tiny_noise : boolean, default = True
            add a tiny bit of noise to the audio to avoid problematic
            waveforms with many samples at zero amplitude.
        outtype : string {"float", "int", "int16"}, default = "float"
            The "int" (or "int16") waveform is 16 bit integers - in the range
            from [-32768, 32767]; values outside that range are clipped.
            The "float" waveform is floating point numbers - in the range
            from [-1, 1] when **scale** is True.
        pad_to : float, default = 0.0
            add samples so duration is a multiple of `pad_to`.  For example,
            if the duration is 1.99 seconds and `pad_to` is 0.1 then the
            signal will be padded to 2.0 seconds.  No samples are added if
            the duration is already a multiple of `pad_to`.  The padding is
            filled with the same low-level noise used by **add_tiny_noise**.
        quiet : boolean, default = False
            if True, do not print messages about resampling and padding.

    Returns
    =======
        y : ndarray
            a 1D numpy array with audio samples
        fs : int
            the sampling rate of the audio in **y**.

    Note
    ====
    By default, this function will return audio with a sampling rate of
    32 kHz and scaled to be in the range from [-1, 1]

    Example
    =======
    Open a sound file and prepare it for acoustic analysis. By default,
    prep_audio() will resample the audio to a sampling rate of 32000, and
    scale the waveform to use the full range.  In this example, we have also
    asked the function to apply a preemphasis factor of 1 (about 6dB/octave).

    .. code-block:: Python

         y,fs = phon.loadsig("sound.wav",chansel=[0])
         x,fs = phon.prep_audio(y, fs, pre=1)

    Take the right channel, and resample to 16,000 Hz

    .. code-block:: Python

         *chans,fs = phon.loadsig("sound.wav")
         print(f'the old sampling rate is: {fs}')
         y,fs = phon.prep_audio(chans[1],fs, target_fs=16000)
         print(f'the new sampling rate is: {fs}')

    """
    tiny = 0.00001  # peak-to-peak amplitude of the dither / padding noise

    y = np.asarray(x)
    if not np.issubdtype(y.dtype, np.floating):
        # Integer samples would wrap around in the negation and
        # pre-emphasis arithmetic below, so work in floating point.
        y = y.astype(np.float64)

    if target_fs is None:
        target_fs = fs
    elif target_fs != fs:
        # resample to 'target_fs' samples per second
        if not quiet:
            print(f'Prep Audio: Resampling from {fs} to {target_fs}')
        cd = np.gcd(fs, target_fs)  # greatest common divisor of the two rates
        # Integer division keeps the up/down factors exact integers.
        y = resample_poly(y, up=target_fs // cd, down=fs // cd)

    # set the polarity of the signal: the largest excursion should be positive
    if y.size and (np.max(y) + np.min(y)) < 0:
        y = -y

    if pre > 0 and y.size:
        # apply pre-emphasis (first-order difference; first sample kept as is)
        y = np.append(y[0], y[1:] - pre * y[:-1])

    if scale and y.size:
        # Use the peak magnitude: after pre-emphasis the largest excursion
        # may be negative, and dividing by max() alone would overshoot.
        peak = np.max(np.abs(y))
        if peak > 0:  # leave silence alone instead of producing NaN
            y = y / peak * 0.9  # scale to about full range

    if add_tiny_noise:
        y = y + ((np.random.rand(len(y)) - 0.5) * tiny)

    if pad_to > 0:
        # Work in samples rather than seconds so that a duration that is
        # already an exact multiple of pad_to gets no padding at all.
        block = pad_to * target_fs  # samples per pad_to interval
        n_blocks = np.ceil(len(y) / block - 1e-9)  # tolerance for rounding
        extra_samples = max(int(round(n_blocks * block)) - len(y), 0)
        y = np.concatenate((y, (np.random.rand(extra_samples) - 0.5) * tiny))
        if not quiet:
            print(f"Prep Audio: Padding signal to {pad_to} sec, which involves adding {extra_samples} extra samples.")

    if outtype in ("int", "int16"):
        limits = np.iinfo(np.int16)
        # Clip before casting so out-of-range values saturate, not wrap.
        y = np.clip(np.rint(limits.max * y), limits.min, limits.max).astype(np.int16)

    return y, target_fs