def stft(wav, n_fft=1024, overlap=4, dt=tf.int32, absp=False):
    """Compute a Hann-windowed STFT of a 1-D signal with TensorFlow.

    Args:
        wav: Input signal; ``wav.shape`` fixes the placeholder shape.
        n_fft: Frame length (and FFT size) in samples.
        overlap: Overlap factor; the hop size is ``n_fft // overlap``.
        dt: dtype of the input placeholder.
        absp: If true, return magnitudes instead of complex values.

    Returns:
        The evaluated stack of per-frame FFTs (one row per frame), or its
        element-wise magnitude when ``absp`` is true.
    """
    assert (wav.shape[0] > n_fft)
    X = tf.placeholder(dtype=dt, shape=wav.shape)
    # The float32 cast (not the placeholder) is what gets fed below.
    X = tf.cast(X, tf.float32)
    # Integer division: the hop is used as a ``range`` step, which must be
    # an int on Python 3.
    hop = n_fft // overlap
    W = tf.constant(np.hanning(n_fft), dtype=tf.float32)
    # NOTE(review): framing starts at sample 1, not 0 -- confirm intended.
    frames = [
        tf.fft(tf.cast(tf.multiply(W, X[i:i + n_fft]), tf.complex64))
        for i in range(1, wav.shape[0] - n_fft, hop)
    ]
    S = tf.pack(frames)
    target = tf.complex_abs(S) if absp else S
    # Close the session once the value has been fetched.
    with tf.Session() as sess:
        return sess.run(target, feed_dict={X: wav})
def griffin_lim(mag, phase_angle, n_fft, hop, num_iters):
    """Iterative algorithm for phase retrieval from a magnitude spectrogram.

    Args:
      mag: Magnitude spectrogram.
      phase_angle: Initial condition for phase.
      n_fft: Size of the FFT (also used as the window length).
      hop: Stride of FFT.
      num_iters: Griffin-Lim iterations to perform (at least 1).

    Returns:
      audio: Reconstructed samples as returned by ``librosa.istft``.

    Raises:
      ValueError: If ``num_iters`` is smaller than 1.
    """
    # Without at least one iteration there is no audio to return.
    if num_iters < 1:
        raise ValueError("num_iters must be at least 1, got %r" % (num_iters,))
    fft_config = dict(n_fft=n_fft, win_length=n_fft, hop_length=hop, center=True)
    ifft_config = dict(win_length=n_fft, hop_length=hop, center=True)
    complex_specgram = inv_magphase(mag, phase_angle)
    audio = librosa.istft(complex_specgram, **ifft_config)
    # Each further pass re-estimates the phase from the current audio while
    # keeping the target magnitude fixed.
    for _ in range(num_iters - 1):
        complex_specgram = librosa.stft(audio, **fft_config)
        _, phase = librosa.magphase(complex_specgram)
        phase_angle = np.angle(phase)
        complex_specgram = inv_magphase(mag, phase_angle)
        audio = librosa.istft(complex_specgram, **ifft_config)
    return audio
def make_spectrum(self, filename, use_normalize):
    """Read a 16 kHz wav file and slice its log-power spectrogram.

    The spectrogram is computed with a 512-point STFT (hop 256, Hamming
    window), optionally standardised per frequency bin, and cut into
    windows of ``self.FRAMELENGTH`` columns taken every ``self.OVERLAP``
    columns.

    Raises:
        ValueError: If the file's sampling rate is not 16000.
    """
    sr, y = wav.read(filename)
    if sr != 16000:
        raise ValueError('Sampling rate is expected to be 16kHz!')
    if y.dtype != 'float32':
        y = np.float32(y / 32767.)

    D = librosa.stft(y, n_fft=512, hop_length=256, win_length=512,
                     window=scipy.signal.hamming)
    Sxx = np.log10(abs(D) ** 2)

    if use_normalize:
        mean = np.mean(Sxx, axis=1).reshape((257, 1))
        std = np.std(Sxx, axis=1).reshape((257, 1)) + 1e-12
        Sxx = (Sxx - mean) / std

    starts = range(0, Sxx.shape[1] - self.FRAMELENGTH, self.OVERLAP)
    return np.array([Sxx[:, s:s + self.FRAMELENGTH] for s in starts])
def test_stft_istft(self):
    """Check that ``signal.stft``/``signal.istft`` agree with librosa.

    When librosa cannot be imported the check is skipped with a message.
    """
    # Only the optional import is guarded, so ImportErrors raised by the
    # code under test are not silently swallowed.
    try:
        import librosa
    except ImportError:
        print("test_stft_istft require librosa.")
        return
    ds = F.load_digit_wav()
    # ``keys()`` is a view on Python 3 and cannot be indexed directly.
    name = list(ds.keys())[0]
    path = ds[name]
    y, _ = speech.read(path, pcm=True)
    hop_length = int(0.01 * 8000)
    stft = signal.stft(y, n_fft=256, hop_length=hop_length, window='hann')
    stft_ = librosa.stft(y, n_fft=256, hop_length=hop_length, window='hann')
    self.assertTrue(np.allclose(stft, stft_.T))
    y1 = signal.istft(stft, hop_length=hop_length, window='hann')
    y2 = librosa.istft(stft_, hop_length=hop_length, window='hann')
    self.assertTrue(np.allclose(y1, y2))
def griffinlim(spectrogram, n_iter=50, window='hann', n_fft=2048, win_length=2048, hop_length=-1, verbose=False):
    """Estimate a time-domain signal from a spectrogram with Griffin-Lim.

    Args:
        spectrogram: Spectrogram whose magnitude is kept fixed.
        n_iter: Number of phase re-estimation iterations.
        window: Window specification forwarded to librosa.
        n_fft: FFT size for the forward STFT.
        win_length: Window length for both transforms.
        hop_length: Hop size; ``-1`` selects ``n_fft // 4``.
        verbose: Show a progress bar with the Frobenius reconstruction loss.

    Returns:
        The signal returned by ``librosa.istft`` for the final phase estimate.
    """
    if hop_length == -1:
        hop_length = n_fft // 4

    # The target magnitude never changes, so compute it once. ``complex``
    # replaces the ``np.complex`` alias removed from NumPy.
    magnitude = np.abs(spectrogram)
    magnitude_complex = magnitude.astype(complex)

    # Start from a uniformly random phase.
    angles = np.exp(2j * np.pi * np.random.rand(*spectrogram.shape))

    t = tqdm(range(n_iter), ncols=100, mininterval=2.0, disable=not verbose)
    for _ in t:
        inverse = librosa.istft(magnitude_complex * angles, hop_length=hop_length,
                                win_length=win_length, window=window)
        rebuilt = librosa.stft(inverse, n_fft=n_fft, hop_length=hop_length,
                               win_length=win_length, window=window)
        angles = np.exp(1j * np.angle(rebuilt))

        if verbose:
            diff = magnitude - np.abs(rebuilt)
            t.set_postfix(loss=np.linalg.norm(diff, 'fro'))

    return librosa.istft(magnitude_complex * angles, hop_length=hop_length,
                         win_length=win_length, window=window)
def __call__(self, y):
    """Short-time Fourier transform (STFT).

    Calls ``librosa.stft`` on ``y`` with every instance attribute passed
    as a keyword argument.

    Returns a complex-valued matrix D such that
        `np.abs(D[f, t])` is the magnitude of frequency bin `f`
        at frame `t`

        `np.angle(D[f, t])` is the phase of frequency bin `f`
        at frame `t`

    Parameters
    ----------
    y : np.ndarray [shape=(n,)], real-valued
        the input signal (audio time series)

    Returns
    -------
    D : np.ndarray [shape=(1 + n_fft/2, t), dtype=dtype]
        STFT matrix
    """
    stft_kwargs = self.__dict__
    return librosa.stft(y, **stft_kwargs)
def extract_features(file_name):
    """Load an audio file and compute five frame-level feature matrices.

    Args:
        file_name: Path handed to ``librosa.load``.

    Returns:
        Tuple ``(mfccs, chroma, mel, contrast, tonnetz)``; each entry is the
        transpose of the matrix returned by the matching librosa routine.
    """
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.array(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=8).T)
    chroma = np.array(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T)
    # ``y`` is passed by keyword: recent librosa releases reject it positionally.
    mel = np.array(librosa.feature.melspectrogram(y=X, sr=sample_rate).T)
    contrast = np.array(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T)
    tonnetz = np.array(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T)
    return mfccs, chroma, mel, contrast, tonnetz
def extract_features():
    """Record audio from the sound device and return its feature matrix.

    Recording length and rate come from the module-level ``duration`` and
    ``sample_rate``.

    Returns:
        The column-wise concatenation (``np.hstack``) of the transposed
        MFCC, chroma, mel, spectral-contrast and tonnetz matrices.
    """
    X = sounddevice.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sounddevice.wait()
    X = np.squeeze(X)
    stft = np.abs(librosa.stft(X))
    mfccs = np.array(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=8).T)
    chroma = np.array(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T)
    # ``y`` is passed by keyword: recent librosa releases reject it positionally.
    mel = np.array(librosa.feature.melspectrogram(y=X, sr=sample_rate).T)
    contrast = np.array(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T)
    tonnetz = np.array(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T)
    # The original stacked onto a local ``features`` that was never
    # initialised (UnboundLocalError); a single recording needs no stacking.
    features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
    return features
def _complex_spectrogram(self) -> ndarray:
    """Return ``librosa.stft`` of the raw audio.

    Uses ``self.fourier_window_length`` as ``n_fft`` and ``self.hop_length``
    as the hop.
    """
    raw_audio = self.get_raw_audio()
    return librosa.stft(
        y=raw_audio,
        n_fft=self.fourier_window_length,
        hop_length=self.hop_length,
    )
def _griffin_lim(S, n_fft, win_length, hop_length, num_iters): angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) S_complex = np.abs(S).astype(np.complex) for i in range(num_iters): if i > 0: angles = np.exp(1j * np.angle(librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length))) y = librosa.istft(S_complex * angles, hop_length=hop_length, win_length=win_length) return y
def create_spectrogram_from_audio(data):
    """Return the transposed STFT of ``data``.

    FFT size and hop come from ``Config.n_fft`` and ``Config.hop_length``.
    The result stays complex; the real/imaginary split sketched in the
    comment below is not applied.
    """
    stft_matrix = librosa.stft(data, n_fft=Config.n_fft, hop_length=Config.hop_length)
    spectrogram = stft_matrix.transpose()

    # divide the real and imaginary components of each element
    # concatenate the matrix with the real components and the matrix with imaginary components
    # (DataCorruptionError when saving complex numbers in TFRecords)

    # concatenated = np.concatenate([np.real(spectrogram), np.imag(spectrogram)], axis=1)
    return spectrogram  # [num_time_frames, num_freq_bins]
def plot_log_power_specgram(sound_names, raw_sounds):
    """Plot one log-power spectrogram per sound in a 10-row figure.

    Args:
        sound_names: Titles, one per sound.
        raw_sounds: Audio signals, paired with ``sound_names`` by position.
    """
    plt.figure(figsize=(25, 60), dpi=900)
    for i, (n, f) in enumerate(zip(sound_names, raw_sounds), start=1):
        plt.subplot(10, 1, i)
        # ``power_to_db`` replaces the removed ``logamplitude``; its
        # ``ref_power`` argument is now called ``ref``.
        D = librosa.power_to_db(np.abs(librosa.stft(f)) ** 2, ref=np.max)
        librosa.display.specshow(D, x_axis='time', y_axis='log')
        plt.title(n.title())
    plt.suptitle('Figure 3: Log power spectrogram', x=0.5, y=0.915, fontsize=18)
    plt.show()
def extract_feature(file_name):
    """Load an audio file and return five time-averaged feature vectors.

    Args:
        file_name: Path handed to ``librosa.load``.

    Returns:
        Tuple ``(mfccs, chroma, mel, contrast, tonnetz)``; each entry is the
        mean over frames of the matching librosa feature matrix.
    """
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    # ``y`` is passed by keyword: recent librosa releases reject it positionally.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
def get_spectrograms(sound_file):
    '''Compute the mel spectrogram and linear magnitude of `sound_file`.

    Args:
      sound_file: A string. Full path of a sound file.

    Returns:
      Transposed S: A 2d array. A transposed melspectrogram with shape of (T, n_mels)
      Transposed magnitude: A 2d array. Has shape of (T, 1+hp.n_fft//2)
    '''
    # The file is loaded at its native sampling rate (sr=None).
    y, sr = librosa.load(sound_file, sr=None)

    # Complex STFT, (1+n_fft//2, T)
    stft_matrix = librosa.stft(y=y,
                               n_fft=hp.n_fft,
                               hop_length=hp.hop_length,
                               win_length=hp.win_length)

    magnitude = np.abs(stft_matrix)  # (1+n_fft/2, T)
    power = magnitude ** 2  # (1+n_fft/2, T)
    mel = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels)  # (n_mels, T)

    mel_t = np.transpose(mel.astype(np.float32))  # (T, n_mels)
    magnitude_t = np.transpose(magnitude.astype(np.float32))  # (T, 1+n_fft/2)
    return mel_t, magnitude_t
def spectrogram2wav(spectrogram):
    '''Invert a magnitude spectrogram to audio by iterative phase estimation.

    Args:
      spectrogram: [t, f], i.e. [t, nfft // 2 + 1]

    Returns:
      The real part of the last ``invert_spectrogram`` result.
    '''
    spectrogram = spectrogram.T  # [f, t]
    X_best = copy.deepcopy(spectrogram)  # [f, t]
    for _ in range(hp.n_iter):
        X_t = invert_spectrogram(X_best)
        # Keyword arguments: recent librosa releases reject positional
        # n_fft / hop_length.
        est = librosa.stft(X_t, n_fft=hp.n_fft, hop_length=hp.hop_length,
                           win_length=hp.win_length)  # [f, t]
        # Unit-modulus phase; the floor avoids dividing by zero.
        phase = est / np.maximum(1e-8, np.abs(est))  # [f, t]
        X_best = spectrogram * phase  # [f, t]
    X_t = invert_spectrogram(X_best)
    return np.real(X_t)
def make_spectrum_phase(y, FRAMESIZE, OVERLAP, FFTSIZE):
    """Return the standardised log-power spectrogram of ``y`` and its phase.

    Args:
        y: Audio signal handed to ``librosa.stft``.
        FRAMESIZE: Passed to ``librosa.stft`` as ``n_fft``.
        OVERLAP: Passed to ``librosa.stft`` as ``hop_length``.
        FFTSIZE: Passed to ``librosa.stft`` as ``win_length``.

    Returns:
        Tuple ``(Sxx, phase, mean, std)``: the per-bin standardised log10
        power spectrogram, the unit-modulus phase, and the per-bin mean and
        standard deviation (each of shape ``(n_bins, 1)``).
    """
    # NOTE(review): FRAMESIZE feeds n_fft and FFTSIZE feeds win_length,
    # which reads as swapped -- confirm against callers.
    D = librosa.stft(y, n_fft=FRAMESIZE, hop_length=OVERLAP, win_length=FFTSIZE,
                     window=scipy.signal.hamming)
    Sxx = np.log10(abs(D) ** 2)
    phase = np.exp(1j * np.angle(D))
    # keepdims replaces the hard-coded reshape((257, 1)), which only worked
    # for n_fft == 512.
    mean = np.mean(Sxx, axis=1, keepdims=True)
    std = np.std(Sxx, axis=1, keepdims=True) + 1e-12
    Sxx = (Sxx - mean) / std
    return Sxx, phase, mean, std
def get_feature_aqibsaeed_1(X, sr, au_path=None):
    """Return one concatenated vector of time-averaged audio features.

    http://aqibsaeed.github.io/2016-09-03-urban-sound-classification-part-1/

    Args:
        X: Audio signal; replaced by the loaded file when ``au_path`` is given.
        sr: Sampling rate; replaced likewise when ``au_path`` is given.
        au_path: Optional path loaded with ``librosa.load``.

    Returns:
        ``np.hstack`` of the frame-averaged MFCC, chroma, mel,
        spectral-contrast and tonnetz features.
    """
    import librosa
    if au_path is not None:
        X, sr = librosa.load(au_path)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sr, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
    # ``y`` is passed by keyword: recent librosa releases reject it positionally.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sr).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sr).T, axis=0)
    feature = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
    return feature
def transform_audio(self, y):
    '''Compute the STFT magnitude and phase.

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
            STFT magnitude (converted with ``amplitude_to_db`` when
            ``self.log`` is set)

        data['phase'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
            STFT phase
    '''
    duration = get_duration(y=y, sr=self.sr)
    n_frames = self.n_frames(duration)

    spectrum = stft(y, hop_length=self.hop_length, n_fft=self.n_fft)
    spectrum = fix_length(spectrum, n_frames)

    mag, phase = magphase(spectrum)
    if self.log:
        mag = amplitude_to_db(mag, ref=np.max)

    mag_out = mag.T[self.idx].astype(np.float32)
    phase_out = np.angle(phase.T)[self.idx].astype(np.float32)
    return {'mag': mag_out, 'phase': phase_out}
def compute_spec(audio_file, spectro_file):
    """Compute the configured spectrogram of an audio file and pickle it.

    The spectrogram kind is selected by ``config['spectrogram_type']``
    (``'cqt'``, ``'mel'`` or ``'stft'``).

    Args:
        audio_file: Path of the audio file to load.
        spectro_file: Path of the pickle file to write.

    Raises:
        ValueError: If ``config['spectrogram_type']`` is not supported.
    """
    # Get actual audio
    audio, sr = librosa.load(audio_file, sr=config['resample_sr'])
    # Compute spectrogram
    spectrogram_type = config['spectrogram_type']
    if spectrogram_type == 'cqt':
        spec = librosa.cqt(audio, sr=sr, hop_length=config['hop'],
                           n_bins=config['cqt_bins'], real=False)
    elif spectrogram_type == 'mel':
        spec = librosa.feature.melspectrogram(y=audio, sr=sr, hop_length=config['hop'],
                                              n_fft=config['n_fft'], n_mels=config['n_mels'])
    elif spectrogram_type == 'stft':
        spec = librosa.stft(y=audio, n_fft=config['n_fft'])
    else:
        # Fail before creating the output file instead of hitting an
        # unbound ``spec`` below.
        raise ValueError('Unsupported spectrogram_type: %r' % (spectrogram_type,))
    # Write results; pickle emits bytes, so the file must be binary.
    with open(spectro_file, "wb") as f:
        pickle.dump(spec, f, protocol=-1)  # spec shape: MxN.
def expand(self, audio):
    """Run the model on the magnitude spectrogram of ``audio`` and resynthesise.

    The first ``n_input`` STFT bins are cut into chunks of ``n_len`` frames,
    scaled with ``log1p(.) / 12`` and fed to ``self.eva_op``. The output is
    clipped to ``[0, 5e3]``, joined along time, combined with the input
    phase and inverted with ``librosa.istft``.

    Side effects: writes ``debug.npy`` and prints diagnostics.
    """
    ori_len = audio.shape[0]
    # NOTE(review): the down/up-sampled signal is never used afterwards;
    # the STFT below is taken from ``audio`` itself. Confirm whether ``tmp``
    # was meant to be the model input.
    tmp = resample(audio, r=0.5, type='sinc_best')
    down_len = tmp.shape[0]
    tmp = resample(tmp, r=(ori_len + 1) / float(down_len), type='sinc_best')

    # One STFT serves both the phase and the magnitude (it used to be
    # computed twice); n_fft is passed by keyword for recent librosa.
    spectrum = librosa.stft(audio, n_fft=1024)
    magnitude = np.abs(spectrum)
    phase = np.divide(spectrum, magnitude)

    spec_input = magnitude[0:n_input, ::]
    # Drop trailing frames so the time axis splits evenly into n_len chunks.
    spec_input = spec_input[::, 0:spec_input.shape[1] // n_len * n_len]
    spec_input = np.split(spec_input, spec_input.shape[1] // n_len, axis=1)
    spec_input = np.asarray(spec_input)
    spec_input = np.expand_dims(spec_input, axis=-1)
    feed_dict = {self.input_op: np.log1p(spec_input) / 12.0}

    debug = self.sess.run(self.debug_op, feed_dict=feed_dict)
    np.save('debug.npy', debug)

    S = self.sess.run(self.eva_op, feed_dict=feed_dict)
    S[S >= 5e3] = 5e3
    S[S <= 0] = 0
    print('mean', np.mean(S))
    print(np.sum(np.isinf(S)))

    # Re-join the chunks along time and drop the singleton axes.
    S = np.squeeze(np.concatenate(np.split(S, S.shape[0]), axis=2), axis=(0, -1))
    phase = phase[..., :S.shape[1]]
    print(phase.shape)
    print(S.shape)
    print(np.sum(np.isinf(np.multiply(S, phase))))
    X = librosa.istft(np.multiply(S, phase))
    return X
def get_spectrograms(sound_file):
    '''Load `sound_file` at `hp.sr` and return its mel and linear magnitudes.

    Args:
      sound_file: A string. Full path of a sound file.

    Returns:
      Transposed S: A 2d array. A transposed melspectrogram with shape of (T, n_mels)
      Transposed magnitude: A 2d array. Has shape of (T, 1+hp.n_fft//2)
    '''
    # Audio is resampled to hp.sr on load.
    y, sr = librosa.load(sound_file, sr=hp.sr)

    stft_args = dict(n_fft=hp.n_fft,
                     hop_length=hp.hop_length,
                     win_length=hp.win_length)
    D = librosa.stft(y=y, **stft_args)  # (1+n_fft//2, T)

    magnitude = np.abs(D)  # (1+n_fft/2, T)
    S = librosa.feature.melspectrogram(S=magnitude ** 2, n_mels=hp.n_mels)  # (n_mels, T)

    # (T, n_mels), (T, 1+n_fft/2)
    return (np.transpose(S.astype(np.float32)),
            np.transpose(magnitude.astype(np.float32)))
def source_separation(self, x):
    """Return the first NMF component of ``x`` when several sources are found.

    The number of sources is estimated from the STFT of the signal. When
    ``Duration()(x)`` exceeds 10 only the first 441000 samples are analysed,
    while the separation still runs on the full signal.

    Returns:
        ``x`` unchanged when zero or one source is found, otherwise
        ``NMF(...)[0]``.
    """
    is_long = Duration()(x) > 10
    if is_long:
        stftx = librosa.stft(x[:441000])  # take 10 seconds of signal data to find sources
        print("It can take some time to find any source in this signal")
    else:
        stftx = librosa.stft(x)

    ssp = find_sparse_source_points(stftx.real, stftx.imag)  # find sparsity in the signal
    cos_dist = cosine_distance(ssp)  # cosine distance from sparse data
    sources = find_number_of_sources(cos_dist)  # find possible number of sources

    if (sources == 0) or (sources == 1):
        # x is an instrumental track and doesn't have more than one source
        print("There's only one visible source")
        return x

    print("Separating sources")
    # Long signals were analysed on an excerpt; separate on the full STFT.
    full_stft = librosa.stft(x) if is_long else stftx
    xs = NMF(full_stft, sources)
    return xs[0]  # take the bass part  # TODO: correct NMF to return noiseless reconstruction
def sad_music_remix(self, neg_arous_dir, files, decisions, harmonic = None):
    """Mix two random sounds from ``neg_arous_dir`` into a "sad" remix.

    For every directory visited by ``os.walk`` two files are picked, the
    first is source-separated, both are scratched, aligned, normalised and
    summed.

    Args:
        neg_arous_dir: Directory walked for input sounds.
        files: Array of file stems used to look up decision rows.
        decisions: Decision values handed to ``get_coordinate``.
        harmonic: ``True`` returns the harmonic HPSS component of the mix;
            ``False``/``None`` writes an onset-based, pitch-shifted remix to
            disk and plays it with ``ffplay``.
    """
    # The original test ``harmonic is False or None`` was never true for
    # None, so the default call took the wrong coordinate.
    remix_mode = harmonic is False or harmonic is None
    for subdirs, dirs, sounds in os.walk(neg_arous_dir):
        fx = random.choice(sounds[::-1])
        fy = random.choice(sounds[:])
        x = MonoLoader(filename = neg_arous_dir + '/' + fx)()
        y = MonoLoader(filename = neg_arous_dir + '/' + fy)()
        # Map each file name (without extension) to its index in ``files``.
        fx = fx.split('.')[0]
        fy = fy.split('.')[0]
        fx = np.where(files == fx)[0][0]
        fy = np.where(files == fy)[0][0]
        coordinate = 1 if remix_mode else 2
        dec_x = get_coordinate(fx, coordinate, decisions)
        dec_y = get_coordinate(fy, coordinate, decisions)
        x = self.source_separation(x)
        x = scratch_music(x, dec_x)
        x = x[np.nonzero(x)]
        y = scratch_music(y, dec_y)
        y = y[np.nonzero(y)]
        x, y = same_time(x, y)
        negative_arousal_samples = [i / i.max() for i in (x, y)]
        negative_arousal_x = np.array(negative_arousal_samples).sum(axis=0)
        negative_arousal_x = 0.5 * negative_arousal_x / negative_arousal_x.max()
        if harmonic is True:
            return librosa.decompose.hpss(librosa.stft(negative_arousal_x), margin = (1.0, 5.0))[0]
        if remix_mode:
            onsets = hfc_onsets(np.float32(negative_arousal_x))
            interv = seconds_to_indices(onsets)
            steps = overlapped_intervals(interv)
            output = librosa.effects.remix(negative_arousal_x, steps[::-1], align_zeros = False)
            output = librosa.effects.pitch_shift(output, sr = 44100, n_steps = 3)
            remix_filename = 'data/emotions/remixes/sad/'+str(time.strftime("%Y%m%d-%H:%M:%S"))+'multitag_remix.ogg'
            MonoWriter(filename=remix_filename, format = 'ogg', sampleRate = 44100)(np.float32(output))
            subprocess.call(["ffplay", "-nodisp", "-autoexit", remix_filename])
def happy_music_remix(self, pos_arous_dir, files, decisions, harmonic = None):
    """Mix two random sounds from ``pos_arous_dir`` into a "happy" remix.

    For every directory visited by ``os.walk`` two files are picked, the
    first is source-separated, both are scratched, aligned, normalised and
    summed.

    Args:
        pos_arous_dir: Directory walked for input sounds.
        files: Array of file stems used to look up decision rows.
        decisions: Decision values handed to ``get_coordinate``.
        harmonic: ``True`` returns the harmonic HPSS component of the mix;
            ``False``/``None`` writes a rhythm-based, pitch-shifted remix to
            disk and plays it with ``ffplay``.
    """
    # The original test ``harmonic is False or None`` was never true for
    # None, so the default call took the wrong coordinate.
    remix_mode = harmonic is False or harmonic is None
    for subdirs, dirs, sounds in os.walk(pos_arous_dir):
        fx = random.choice(sounds[::-1])
        fy = random.choice(sounds[:])
        x = MonoLoader(filename = pos_arous_dir + '/' + fx)()
        y = MonoLoader(filename = pos_arous_dir + '/' + fy)()
        # Map each file name (without extension) to its index in ``files``.
        fx = fx.split('.')[0]
        fy = fy.split('.')[0]
        fx = np.where(files == fx)[0][0]
        fy = np.where(files == fy)[0][0]
        coordinate = 3 if remix_mode else 0
        dec_x = get_coordinate(fx, coordinate, decisions)
        dec_y = get_coordinate(fy, coordinate, decisions)
        x = self.source_separation(x)
        x = scratch_music(x, dec_x)
        y = scratch_music(y, dec_y)
        x = x[np.nonzero(x)]
        y = y[np.nonzero(y)]
        x, y = same_time(x, y)
        positive_arousal_samples = [i / i.max() for i in (x, y)]
        positive_arousal_x = np.float32(positive_arousal_samples).sum(axis=0)
        positive_arousal_x = 0.5 * positive_arousal_x / positive_arousal_x.max()
        if harmonic is True:
            return librosa.decompose.hpss(librosa.stft(positive_arousal_x), margin = (1.0, 5.0))[0]
        if remix_mode:
            interv = RhythmExtractor2013()(positive_arousal_x)[1] * 44100
            steps = overlapped_intervals(interv)
            output = librosa.effects.remix(positive_arousal_x, steps, align_zeros = False)
            output = librosa.effects.pitch_shift(output, sr = 44100, n_steps = 4)
            remix_filename = 'data/emotions/remixes/happy/'+str(time.strftime("%Y%m%d-%H:%M:%S"))+'multitag_remix.ogg'
            MonoWriter(filename=remix_filename, format = 'ogg', sampleRate = 44100)(np.float32(output))
            subprocess.call(["ffplay", "-nodisp", "-autoexit", remix_filename])
def not_angry_music_remix(self, neg_arous_dir, files, decisions):
    """Morph two random sounds into a "not angry" remix, save and play it.

    One random file is collected per directory visited under each entry of
    ``neg_arous_dir``; two of them are loaded, scratched, aligned and
    morphed with ``stft.morph``. The morph is remixed on its reversed onset
    intervals, pitch-shifted by 4 steps, written as ogg and played with
    ``ffplay``.
    """
    sounds = []
    for directory in neg_arous_dir:
        for subdirs, dirs, s in os.walk(directory):
            sounds.append(subdirs + '/' + random.choice(s))

    path_x = random.choice(sounds[::-1])
    path_y = random.choice(sounds[:])
    x = MonoLoader(filename = path_x)()
    y = MonoLoader(filename = path_y)()

    # Second path component without extension, looked up in ``files``.
    stem_x = path_x.split('/')[1].split('.')[0]
    stem_y = path_y.split('/')[1].split('.')[0]
    idx_x = np.where(files == stem_x)[0]
    idx_y = np.where(files == stem_y)[0]
    dec_x = get_coordinate(idx_x, choice(range(1,3)), decisions)
    dec_y = get_coordinate(idx_y, choice(range(1,3)), decisions)

    x = self.source_separation(x)
    x = scratch_music(x, dec_x)
    y = scratch_music(y, dec_y)
    x = x[np.nonzero(x)]
    y = y[np.nonzero(y)]
    x, y = same_time(x, y)

    morph = stft.morph(x1 = x, x2 = y, fs = 44100,
                       w1=np.hanning(1025), N1=2048,
                       w2=np.hanning(1025), N2=2048,
                       H1=512, smoothf=0.1, balancef=0.7)
    onsets = hfc_onsets(np.float32(morph))
    steps = overlapped_intervals(seconds_to_indices(onsets))
    output = librosa.effects.remix(morph, steps[::-1], align_zeros = False)
    output = librosa.effects.pitch_shift(output, sr = 44100, n_steps = 4)

    remix_filename = 'data/emotions/remixes/not angry/'+str(time.strftime("%Y%m%d-%H:%M:%S"))+'multitag_remix.ogg'
    writer = MonoWriter(filename = remix_filename, sampleRate = 44100, format = 'ogg')
    writer(np.float32(output))
    subprocess.call(["ffplay", "-nodisp", "-autoexit", remix_filename])
def not_relaxed_music_remix(self, pos_arous_dir, files, decisions):
    """Morph two random sounds into a "not relaxed" remix, save and play it.

    One random file is collected per directory visited under each entry of
    ``pos_arous_dir``; two of them are loaded, scratched, aligned and
    morphed with ``stft.morph``. The morph is remixed on its reversed
    rhythm intervals, pitch-shifted by 3 steps, written as ogg and played
    with ``ffplay``.
    """
    sounds = []
    for directory in pos_arous_dir:
        for subdirs, dirs, s in os.walk(directory):
            sounds.append(subdirs + '/' + random.choice(s))

    path_x = random.choice(sounds[::-1])
    path_y = random.choice(sounds[:])
    x = MonoLoader(filename = path_x)()
    y = MonoLoader(filename = path_y)()

    # Second path component without extension, looked up in ``files``.
    stem_x = path_x.split('/')[1].split('.')[0]
    stem_y = path_y.split('/')[1].split('.')[0]
    idx_x = np.where(files == stem_x)[0]
    idx_y = np.where(files == stem_y)[0]
    dec_x = get_coordinate(idx_x, choice([0,1,3]), decisions)
    dec_y = get_coordinate(idx_y, choice([0,1,3]), decisions)

    x = self.source_separation(x)
    x = scratch_music(x, dec_x)
    y = scratch_music(y, dec_y)
    x = x[np.nonzero(x)]
    y = y[np.nonzero(y)]
    x, y = same_time(x, y)

    morph = stft.morph(x1 = x, x2 = y, fs = 44100,
                       w1=np.hanning(1025), N1=2048,
                       w2=np.hanning(1025), N2=2048,
                       H1=512, smoothf=0.01, balancef=0.7)
    interv = RhythmExtractor2013()(np.float32(morph))[1] * 44100
    steps = overlapped_intervals(interv)
    output = librosa.effects.remix(morph, steps[::-1], align_zeros = False)
    output = librosa.effects.pitch_shift(output, sr = 44100, n_steps = 3)

    remix_filename = 'data/emotions/remixes/not relaxed/'+str(time.strftime("%Y%m%d-%H:%M:%S"))+'multitag_remix.ogg'
    writer = MonoWriter(filename = remix_filename, sampleRate = 44100, format = 'ogg')
    writer(np.float32(output))
    subprocess.call(["ffplay", "-nodisp", "-autoexit", remix_filename])
def parse_audio(self, audio_path):
    """Load an audio file and return its log-magnitude spectrogram tensor.

    The audio is optionally augmented and, with probability
    ``self.noise_prob``, mixed with noise. The ``log1p`` STFT magnitude is
    returned as a ``torch.FloatTensor``, standardised in place when
    ``self.normalize`` is set.
    """
    if self.augment:
        y = load_randomly_augmented_audio(audio_path, self.sample_rate)
    else:
        y = load_audio(audio_path)

    # One Bernoulli draw decides whether noise is injected.
    if self.noiseInjector and np.random.binomial(1, self.noise_prob):
        y = self.noiseInjector.inject_noise(y)

    # Window and stride are turned into sample counts via the sample rate.
    n_fft = int(self.sample_rate * self.window_size)
    hop_length = int(self.sample_rate * self.window_stride)

    # STFT
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=n_fft, window=self.window)
    magnitude, _ = librosa.magphase(D)

    # S = log(S+1)
    spect = torch.FloatTensor(np.log1p(magnitude))
    if self.normalize:
        mean = spect.mean()
        std = spect.std()
        spect.add_(-mean)
        spect.div_(std)
    return spect
def __init__(self, tex_wnd, fft_len=512, sr=22050):
    """Store the texture window and precompute its magnitude spectrum.

    The magnitude STFT of ``tex_wnd`` is taken with ``fft_len`` as both FFT
    size and hop.
    """
    self.tex_wnd = tex_wnd
    self.an_wnd_len = fft_len
    self.sr = sr

    # calc signal spectrum
    spectrum = librosa.stft(y=tex_wnd, n_fft=fft_len, hop_length=fft_len)
    self.fft_tex_wnds = np.abs(spectrum)
def stft_mc(x, N=1024, hop=None, window='hann'):
    """Multi-channel STFT built on ``librosa.core.stft``.

    Args:
        x: Signal of shape ``(nsampl,)`` or ``(nch, nsampl)``.
        N: FFT size.
        hop: Hop size in samples; defaults to ``N // 2``.
        window: Window specification forwarded to librosa.

    Returns:
        complex64 array of shape ``(N // 2 + 1, n_frames, nch)``.
    """
    # Integer division keeps sizes integral on Python 3 (``N/2`` would be a
    # float and break ``np.zeros`` below).
    if hop is None:
        hop = N // 2
    S = x.shape
    if len(S) == 1:
        nch = 1
        nsampl = len(x)
        x = np.reshape(x, (1, nsampl))
    else:
        nch = S[0]
        nsampl = S[1]
    xdtype = x.dtype
    # Zero-pad the tail so the length is a whole number of hops.
    nfram = int(np.ceil(float(nsampl) / float(hop)))
    npad = int(nfram) * hop - nsampl
    pad = np.zeros((nch, npad)).astype(xdtype)
    x = np.concatenate((x, pad), axis=1)
    # pad the edges to avoid window taper effects
    pad = np.zeros((nch, N)).astype(xdtype)
    x = np.concatenate((pad, x, pad), axis=1)
    for ich in range(0, nch):
        x0 = x[ich, :]
        if not x0.flags.c_contiguous:
            x0 = x0.copy(order='C')
        X0 = librosa.core.stft(x0, n_fft=N, hop_length=hop, window=window,
                               center=False, dtype=np.complex64)
        if ich == 0:
            # The frame count is only known after the first channel.
            X = np.zeros((N // 2 + 1, X0.shape[1], nch)).astype(np.complex64)
        X[:, :, ich] = X0
    return X
def process_audio(fname, n_fft=2048, win_length=1200, hop_length=300, sr=16000):
    """Load an audio file and return its log-mel and log-linear spectrograms.

    The signal is trimmed, zero-padded to ``maximum_audio_length``,
    pre-emphasised and transformed. Both outputs go through
    ``reshape_frames``.

    Returns:
        ``(mel, stft)``, or ``(None, None)`` when the trimmed signal is
        longer than ``maximum_audio_length``.
    """
    wave, sr = librosa.load(fname, mono=True, sr=sr)

    # trim initial silence
    wave, _ = librosa.effects.trim(wave)

    # first pad the audio to the maximum length
    # we ensure it is a multiple of 4r so it works with max frames
    # (parenthesised: ``% 4*r`` parses as ``(x % 4) * r``)
    assert math.ceil(maximum_audio_length / hop_length) % (4 * r) == 0
    if wave.shape[0] > maximum_audio_length:
        return None, None
    wave = np.pad(wave, (0, maximum_audio_length - wave.shape[0]),
                  'constant', constant_values=0)

    pre_emphasis = 0.97
    wave = np.append(wave[0], wave[1:] - pre_emphasis * wave[:-1])

    stft = librosa.stft(wave, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
    # NOTE(review): the mel projection is applied to the complex STFT and
    # without ``sr`` -- confirm this is intended before changing it.
    mel = librosa.feature.melspectrogram(S=stft, n_mels=80)

    # The small constant avoids log(0).
    stft = np.log(np.abs(stft) + 1e-8)
    mel = np.log(np.abs(mel) + 1e-8)

    stft = reshape_frames(stft)
    mel = reshape_frames(mel)

    return mel, stft
def stft(x, fs, framesz, hop):
    """Hann-windowed short-time Fourier transform.

    Args:
        x: 1-D signal.
        fs: Sampling rate in Hz.
        framesz: Frame length in seconds.
        hop: Hop length in seconds.

    Returns:
        Complex array with one full FFT per row. Frames start every
        ``int(hop * fs)`` samples at offsets strictly below
        ``len(x) - int(framesz * fs)``.
    """
    # The top-level scipy.hanning / scipy.array / callable scipy.fft aliases
    # were removed from SciPy; these are the NumPy equivalents.
    import numpy as np

    framesamp = int(framesz * fs)
    hopsamp = int(hop * fs)
    w = np.hanning(framesamp)
    X = np.array([np.fft.fft(w * x[i:i + framesamp])
                  for i in range(0, len(x) - framesamp, hopsamp)])
    return X
def wave_to_complex_spectrogram(wave, fs):
    """Return the STFT of ``wave`` (n_fft=512, hop=128, win_length=512).

    ``fs`` is accepted but not used.
    """
    stft_settings = dict(n_fft=512, hop_length=128, win_length=512)
    return librosa.stft(wave, **stft_settings)
def _griffin_lim_tensorflow(S):
    '''TensorFlow implementation of Griffin-Lim

    Runs ``hparams.griffin_lim_iters`` phase re-estimation steps and returns
    the signal with the batch axis removed.
    Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
    '''
    with tf.variable_scope('griffinlim'):
        # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1
        batched = tf.expand_dims(S, 0)
        S_complex = tf.identity(tf.cast(batched, dtype=tf.complex64))
        y = _istft_tensorflow(S_complex)
        for _ in range(hparams.griffin_lim_iters):
            est = _stft_tensorflow(y)
            # Unit-modulus phase; the floor avoids dividing by zero.
            magnitude = tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64)
            angles = est / magnitude
            y = _istft_tensorflow(S_complex * angles)
        return tf.squeeze(y, 0)
def _stft(y):
    """Return ``librosa.stft`` of ``y`` using the ``_stft_parameters()`` sizes."""
    fft_size, hop, window = _stft_parameters()
    return librosa.stft(y=y, n_fft=fft_size, hop_length=hop, win_length=window)
def _stft_tensorflow(signals):
    """Return ``tf.contrib.signal.stft`` of ``signals`` without end padding.

    Sizes come from ``_stft_parameters()``.
    """
    fft_size, hop, window = _stft_parameters()
    return tf.contrib.signal.stft(
        signals,
        frame_length=window,
        frame_step=hop,
        fft_length=fft_size,
        pad_end=False,
    )
def get_spectrogram(sound_fpath):
    '''Extracts the melspectrogram from given `sound_fpath`.

    Args:
      sound_fpath: A string. Full path of a sound file.

    Returns:
      Transposed S: A 2d array. A transposed melspectrogram with shape of (T, n_mels)
    '''
    # The file is loaded at its native sampling rate (sr=None).
    y, sr = librosa.load(sound_fpath, sr=None)

    # Complex STFT, (1+n_fft//2, T)
    stft_matrix = librosa.stft(y=y,
                               n_fft=hp.n_fft,
                               hop_length=hp.hop_length,
                               win_length=hp.win_length)

    # Power spectrogram, (1+n_fft/2, T)
    power = np.abs(stft_matrix) ** 2

    # Mel spectrogram, (n_mels, T)
    mel = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels)

    return np.transpose(mel.astype(np.float32))
def parse_audio(self, audio_path):
    """Return the (optionally normalised) log-magnitude spectrogram of a file.

    The audio is loaded with or without random augmentation depending on
    ``self.augment``, and noise is injected with probability
    ``self.noise_prob`` when a noise injector is configured. The result is
    a ``torch.FloatTensor`` holding ``log1p`` of the STFT magnitude.
    """
    loader = load_randomly_augmented_audio if self.augment else load_audio
    y = loader(audio_path)

    if self.noiseInjector:
        # One Bernoulli draw decides whether noise is injected.
        if np.random.binomial(1, self.noise_prob):
            y = self.noiseInjector.inject_noise(y)

    frame_len = int(self.sample_rate * self.window_size)
    stride = int(self.sample_rate * self.window_stride)

    # STFT; the window spans the whole FFT frame.
    D = librosa.stft(y, n_fft=frame_len, hop_length=stride,
                     win_length=frame_len, window=self.window)
    spect = librosa.magphase(D)[0]

    # S = log(S+1)
    spect = torch.FloatTensor(np.log1p(spect))
    if self.normalize:
        mean, std = spect.mean(), spect.std()
        spect.add_(-mean)
        spect.div_(std)

    return spect
def process_music_data(data_in, is_fft, is_energy, n_output_bins, n_fft, is_visual):
    """Turn a raw float32 audio buffer into FFT-bin and energy outputs.

    Args:
        data_in: Bytes-like buffer of float32 samples (4 bytes per sample).
        is_fft: Compute the binned FFT output when true.
        is_energy: Compute the scaled signal energy when true.
        n_output_bins: Number of FFT output bins.
        n_fft: FFT size (also used as hop) for the STFT.
        is_visual: Forward the decoded samples to ``visualizer`` when true.

    Returns:
        ``(fft_output, energy_output)``. Disabled outputs are zero arrays
        (uint8 of length ``n_output_bins`` and uint16 of length 2).
    """
    # length is len(data_in)/4
    # ``frombuffer`` replaces the deprecated binary ``fromstring`` and the
    # removed 'Float32' dtype alias; the copy keeps the array writable.
    data_np = np.frombuffer(data_in, dtype=np.float32).copy()

    # visualizer
    if is_visual:
        visualizer(data_np)

    # energy
    if is_energy:
        energy = (np.abs(data_np) ** 2).sum()
        energy *= 2 ** 5
        energy_output = energy.astype(np.uint16)
    else:
        energy_output = np.zeros(2).astype(np.uint16)

    # fft
    if is_fft:
        # down-sample by 4, with filtering, energy not scaled
        # (keyword arguments: recent librosa releases reject positional rates)
        data_np = librosa.resample(data_np, orig_sr=sample_rate,
                                   target_sr=sample_rate / 4,
                                   res_type='kaiser_fast')

        # short time fft over n_fft samples
        fft_data = librosa.stft(data_np, n_fft=n_fft, hop_length=n_fft, center=False)

        fft_data_mag = np.abs(fft_data[0:n_fft // 2]) ** 2

        # magnitude scaling
        fft_data_mag *= 2 ** 3
        fft_output = get_output_fft_bins(fft_data_mag, n_output_bins)
        fft_output = fft_output.astype(np.uint8)
    else:
        fft_output = np.zeros(n_output_bins).astype(np.uint8)

    return fft_output, energy_output
def feature_extraction(y=None, fs=None, statistics=True, include_mfcc0=True, include_delta=True,
                       include_acceleration=True, mfcc_params=None, delta_params=None,
                       acceleration_params=None):
    """Extract MFCC features with optional delta and acceleration.

    Args:
        y: Audio signal.
        fs: Sampling rate used to build the mel filter bank.
        statistics: Also return summary statistics of the feature matrix.
        include_mfcc0: Keep the first row of the stacked features.
        include_delta: Append delta coefficients.
        include_acceleration: Append second-order delta coefficients.
        mfcc_params: Dict with keys ``window``, ``n_fft``, ``win_length``,
            ``hop_length``, ``n_mels``, ``fmin``, ``fmax`` and ``htk``.
        delta_params: Extra keyword arguments for the delta computation.
        acceleration_params: Extra keyword arguments for the second-order
            delta computation.

    Returns:
        Dict with ``'feat'`` (frames x features) and, when ``statistics`` is
        true, ``'stat'`` holding ``mean``, ``std``, ``N``, ``S1`` and ``S2``.

    Raises:
        TypeError: If ``mfcc_params`` is not given.
    """
    if mfcc_params is None:
        raise TypeError("feature_extraction() requires mfcc_params")
    # Omitted parameter dicts mean "no extra options", not ``**None``.
    delta_params = delta_params or {}
    acceleration_params = acceleration_params or {}

    # Extract features, Mel Frequency Cepstral Coefficients
    eps = numpy.spacing(1)

    # Windowing function: (window name, periodic?) per configuration value.
    # ``get_window(..., fftbins=True)`` equals the old ``sym=False`` windows.
    window_specs = {
        'hamming_asymmetric': ('hamming', True),
        'hamming_symmetric': ('hamming', False),
        'hann_asymmetric': ('hann', True),
        'hann_symmetric': ('hann', False),
    }
    stft_kwargs = dict(n_fft=mfcc_params['n_fft'],
                       win_length=mfcc_params['win_length'],
                       hop_length=mfcc_params['hop_length'])
    window_spec = window_specs.get(mfcc_params['window'])
    if window_spec is not None:
        window_name, periodic = window_spec
        stft_kwargs['window'] = scipy.signal.get_window(window_name, mfcc_params['n_fft'],
                                                        fftbins=periodic)
    # Unknown names fall back to librosa's default window (no explicit None).

    # Calculate Static Coefficients
    power_spectrogram = numpy.abs(librosa.stft(y + eps, **stft_kwargs)) ** 2
    mel_basis = librosa.filters.mel(sr=fs,
                                    n_fft=mfcc_params['n_fft'],
                                    n_mels=mfcc_params['n_mels'],
                                    fmin=mfcc_params['fmin'],
                                    fmax=mfcc_params['fmax'],
                                    htk=mfcc_params['htk'])
    mel_spectrum = numpy.dot(mel_basis, power_spectrogram)
    # ``power_to_db`` replaces the removed ``logamplitude``.
    mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel_spectrum))

    # Collect the feature matrix
    feature_matrix = mfcc
    if include_delta:
        # Delta coefficients
        mfcc_delta = librosa.feature.delta(mfcc, **delta_params)
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta))

    if include_acceleration:
        # Acceleration coefficients (second-order delta)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2, **acceleration_params)
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta2))

    if not include_mfcc0:
        # Omit mfcc0
        feature_matrix = feature_matrix[1:, :]

    feature_matrix = feature_matrix.T

    # Collect into data structure
    if not statistics:
        return {'feat': feature_matrix}
    return {
        'feat': feature_matrix,
        'stat': {
            'mean': numpy.mean(feature_matrix, axis=0),
            'std': numpy.std(feature_matrix, axis=0),
            'N': feature_matrix.shape[0],
            'S1': numpy.sum(feature_matrix, axis=0),
            'S2': numpy.sum(feature_matrix ** 2, axis=0),
        }
    }