The following 50 code examples, extracted from open-source Python projects, illustrate how to use librosa.load().
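Before the project examples, here is a minimal, self-contained sketch of the basic call (the file path below is only a placeholder). librosa.load returns the decoded waveform as a 1-D NumPy array together with its sample rate; by default it resamples to 22050 Hz and downmixes to mono unless told otherwise.

import librosa

# Placeholder path; substitute any audio file readable by librosa.
audio_path = "example.wav"

# sr=None keeps the file's native sample rate; an integer (e.g. sr=16000) resamples on load.
# mono=True (the default) downmixes multi-channel audio to a single channel.
y, sr = librosa.load(audio_path, sr=None, mono=True)
print(y.shape, y.dtype, sr)  # 1-D float32 waveform and its sample rate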
def getdata(self):
    # Structure for the array of songs
    song_data = []
    genre_data = []

    # Read files from the folders
    for x, _ in self.genres.items():
        for root, subdirs, files in os.walk(self.file_path + x):
            for file in files:
                # Read the audio file
                file_name = self.file_path + x + "/" + file
                print(file_name)
                signal, sr = librosa.load(file_name)

                # Calculate the melspectrogram of the audio and use log scale
                melspec = librosa.feature.melspectrogram(signal[:self.song_samples], sr=sr,
                                                         n_fft=self.n_fft,
                                                         hop_length=self.hop_length).T[:1280, ]

                # Append the result to the data structure
                song_data.append(melspec)
                genre_data.append(self.genres[x])

    return np.array(song_data), keras.utils.to_categorical(genre_data, len(self.genres))
def __init__(self,
             audio_file: Path,
             id: Optional[str] = None,
             sample_rate_to_convert_to: int = 16000,
             label: Optional[str] = "nolabel",
             fourier_window_length: int = 512,
             hop_length: int = 128,
             mel_frequency_count: int = 128,
             label_with_tags: str = None,
             positional_label: Optional[PositionalLabel] = None):
    # The default values for hop_length and fourier_window_length are powers of 2
    # near the values specified in the wave2letter paper.
    if id is None:
        id = name_without_extension(audio_file)
    self.audio_file = audio_file

    super().__init__(
        id=id,
        get_raw_audio=lambda: librosa.load(str(self.audio_file), sr=self.sample_rate)[0],
        label=label,
        sample_rate=sample_rate_to_convert_to,
        fourier_window_length=fourier_window_length,
        hop_length=hop_length,
        mel_frequency_count=mel_frequency_count,
        label_with_tags=label_with_tags,
        positional_label=positional_label)
def __init__(self, images, labels, fake_data=False, one_hot=False, load=False):
    """Construct a DataSet. one_hot arg is used only if fake_data is true."""
    if fake_data:
        self._num_examples = 10000
        self.one_hot = one_hot
    else:
        num = len(images)
        assert num == len(labels), ('images.shape: %s labels.shape: %s' % (images.shape, labels.shape))
        print("len(images) %d" % num)
        self._num_examples = num
        self.cache = {}
        self._image_names = numpy.array(images)
        self._labels = labels
        self._epochs_completed = 0
        self._index_in_epoch = 0
        self._images = []
        if load:  # Otherwise loaded on demand
            self._images = self.load(self._image_names)
def read_data_sets(train_dir, source_data=Source.NUMBER_IMAGES, fake_data=False, one_hot=True):
    class DataSets(object):
        pass
    data_sets = DataSets()
    if fake_data:
        data_sets.train = DataSet([], [], fake_data=True, one_hot=one_hot)
        data_sets.validation = DataSet([], [], fake_data=True, one_hot=one_hot)
        data_sets.test = DataSet([], [], fake_data=True, one_hot=one_hot)
        return data_sets
    VALIDATION_SIZE = 2000
    local_file = maybe_download(source_data, train_dir)
    train_images = extract_images(TRAIN_INDEX, train=True)
    train_labels = extract_labels(TRAIN_INDEX, train=True, one_hot=one_hot)
    test_images = extract_images(TEST_INDEX, train=False)
    test_labels = extract_labels(TEST_INDEX, train=False, one_hot=one_hot)
    # train_images = train_images[:VALIDATION_SIZE]
    # train_labels = train_labels[:VALIDATION_SIZE:]
    # test_images = test_images[VALIDATION_SIZE:]
    # test_labels = test_labels[VALIDATION_SIZE:]
    data_sets.train = DataSet(train_images, train_labels, load=False)
    data_sets.test = DataSet(test_images, test_labels, load=True)
    # data_sets.validation = DataSet(validation_images, validation_labels, load=True)
    return data_sets
def load_audio(audio_filename, sample_rate):
    """Loads an audio file.

    Args:
      audio_filename: File path to load.
      sample_rate: The number of samples per second at which the audio will be
          returned. Resampling will be performed if necessary.

    Returns:
      A numpy array of audio samples, single-channel (mono) and sampled at the
      specified rate, in float32 format.

    Raises:
      AudioIOReadException: If librosa is unable to load the audio data.
    """
    try:
        y, unused_sr = librosa.load(audio_filename, sr=sample_rate, mono=True)
    except Exception as e:  # pylint: disable=broad-except
        raise AudioIOReadException(e)
    return y
def get_seeds(self, audio_filepath):
    """Get the seeds file to pass to the HLL tracker.

    Parameters
    ----------
    audio_filepath : str
        Path to audio file.

    Returns
    -------
    seeds_fpath : str
        Path to the seeds output file.
    """
    y, sr = librosa.load(audio_filepath, sr=44100)
    y_harmonic = librosa.effects.harmonic(y)
    cqt, samples, freqs = self._compute_cqt(y_harmonic, sr)
    seeds = self._pick_seeds_cqt(cqt, freqs, samples)
    seeds_fpath = tmp.mktemp('.csv')
    with open(seeds_fpath, 'w') as fhandle:
        writer = csv.writer(fhandle, delimiter=',')
        writer.writerows(seeds)
    return seeds_fpath
def compute_spectrograms(filename):
    out_rate = 12000
    N_FFT = 512
    HOP_LEN = 256

    frames, rate = librosa.load(filename, sr=out_rate, mono=True)
    if len(frames) < out_rate * 3:
        # less than 3 seconds - can't process
        raise Exception("Audio duration is too short")

    logam = librosa.logamplitude
    melgram = librosa.feature.melspectrogram
    x = logam(melgram(y=frames, sr=out_rate, hop_length=HOP_LEN,
                      n_fft=N_FFT, n_mels=N_MEL_BANDS) ** 2,
              ref_power=1.0)

    # now going through spectrogram with the stride of the segment duration
    for start_idx in range(0, x.shape[1] - SEGMENT_DUR + 1, SEGMENT_DUR):
        yield x[:, start_idx:start_idx + SEGMENT_DUR]
def predict_on_long_clips():
    """Load the saved model and perform inference/prediction on features obtained from inputs.
    Splits the audio into 10-second chunks and predicts on those chunks."""
    with open(FILENAMES, "r") as fh:
        filecontents = fh.read()
    filenames = filecontents.splitlines()
    random.shuffle(filenames)
    filenames = filenames[:5]  # [:5] is for quickly verifying if things work
    filenames = [DATASET_LOCATION + f for f in filenames]

    session = tf.Session()
    saver = tf.train.import_meta_graph(IMPORT_META_GRAPH)
    saver.restore(session, tf.train.latest_checkpoint(IMPORT_LATEST_CHECKPOINT))
    tf.global_variables_initializer().run(session=session)

    test_x = {}
    for f in filenames:
        s, sr = librosa.load(f)
        total_chunks = s.shape[0] // max_audio_length
        waveforms = [s[max_audio_length * i:max_audio_length * (i + 1)]
                     for i in range(total_chunks)]
        test_x[f] = extract_features_from_waveforms(waveforms)
        print("FILENAME: ", f)
        predictions = session.run(tf.argmax(pred, 1), feed_dict={X: test_x[f]})
        print([possible_categories[p] for p in predictions])
def loadFile(self, fname):
    '''
    fname: filename of the sound file we want to load
    '''
    if self.verbose:
        print('Loading %s' % fname)
    if self.cached:
        if not os.path.exists(fname + '-mfcc.npy'):
            y, sr = librosa.load(fname)
            data = mfcc(y=y, sr=sr).T
            np.save(fname + '-mfcc.npy', data)
        else:
            data = np.load(fname + '-mfcc.npy')
    else:
        y, sr = librosa.load(fname)
        # TODO: Add ability to filter by seconds/duration
        # seconds = y.size/sr
        data = mfcc(y=y, sr=sr).T
    return data
def get_mfccs_and_deltas(wav_pathname, n_mfcc=13, n_fft=2048, freq_min=100, freq_max=16000):
    sample_array, sample_rate = librosa.load(wav_pathname, sr=44100)
    if len(sample_array) == 0:
        return []
    else:
        mfcc = librosa.feature.mfcc(sample_array, sample_rate, n_fft=n_fft, hop_length=n_fft,
                                    n_mfcc=n_mfcc, fmin=freq_min, fmax=freq_max)
        delta = librosa.feature.delta(mfcc)
        delta2 = librosa.feature.delta(mfcc, order=2)

        # Transposing tables
        # (We can instead set the axis above to do this without the extra step)
        mfcc = mfcc.T
        delta = delta.T
        delta2 = delta2.T

        mfcc_sans_0th = [frame_values[1:] for frame_values in mfcc]

        all_features = []
        for i in range(len(mfcc)):
            all_features.append(list(mfcc_sans_0th[i]) + list(delta[i]) + list(delta2[i]))
        return all_features
def recognise_mfcc(filePath, outputDir, outputName, debug):
    print("start decompose harmonic/percussive and extract mfcc {0}".format(filePath))
    y, sr = librosa.load(filePath)

    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    mfcc = np.transpose(mfcc)
    basePath = outputDir + outputName
    np.savetxt(basePath + "_normal_mfcc.csv", mfcc, delimiter=",")

    harmonic_sep = 3.0
    percussive_sep = 3.0
    h, p = librosa.effects.hpss(y, margin=(harmonic_sep, percussive_sep))

    hmfcc = librosa.feature.mfcc(y=h, sr=sr)
    hmfcc = np.transpose(hmfcc)
    np.savetxt(basePath + "_harmonic_mfcc.csv", hmfcc, delimiter=",")

    pmfcc = librosa.feature.mfcc(y=p, sr=sr)
    pmfcc = np.transpose(pmfcc)
    np.savetxt(basePath + "_percussive_mfcc.csv", pmfcc, delimiter=",")

    # extract rhythm pattern with rp_extract
def load_generic_audio(directory, sample_rate):
    '''Generator that yields audio waveforms from the directory.'''
    files = find_files(directory)
    id_reg_exp = re.compile(FILE_PATTERN)
    print("files length: {}".format(len(files)))
    randomized_files = randomize_files(files)
    for filename in randomized_files:
        ids = id_reg_exp.findall(filename)
        if not ids:
            # The file name does not match the pattern containing ids, so
            # there is no id.
            category_id = None
        else:
            # The file name matches the pattern for containing ids.
            category_id = int(ids[0][0])
        audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
        audio = audio.reshape(-1, 1)
        yield audio, filename, category_id
def get_audio_analysis(song_url):
    if song_url is None:
        return None, None, None, None, None
    urlretrieve(song_url, "current.mp3")
    y, sr = librosa.load("./current.mp3")

    # Tempo = beats/minute
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)

    # pitch = Frequency
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, fmax=1000, hop_length=1000)
    pitches, magnitudes = extract_max(pitches, magnitudes, pitches.shape)

    y[abs(y) < 10**-2] = 0
    y = np.trim_zeros(y)
    json = {
        'sound_wave': np.array(y[:len(pitches)]).tolist(),
        'pitch': pitches
    }

    y_harm, y_per = librosa.effects.hpss(y)
    harm, perc = audio_fingerprint(y_harm), audio_fingerprint(y_per)
    pitch_ave = np.average(pitches)
    return float(tempo), float(pitch_ave), float(harm), float(perc), json
def main():
    outdir = 'mix'
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    audio_total1, sr = librosa.load('./cao.wav', sr=sample_rate, mono=True)
    audio_total2, sr = librosa.load('./huang.wav', sr=sample_rate, mono=True)

    seglen = int(sav_n_secs * sr)
    len1 = audio_total1.shape[0] - seglen
    len2 = audio_total2.shape[0] - seglen

    for i in range(train_data_num):
        if i % 100 == 0:
            print(i)
        idx1 = random.randint(0, len1)
        idx2 = random.randint(0, len2)
        mix(audio_total1[idx1:idx1 + seglen], audio_total2[idx2:idx2 + seglen],
            sample_rate, sav_n_secs, outdir, i)
def save_cache(src_path, des_path, get_feature_func):
    des_path = osp.splitext(des_path)[0] + '.npy'
    try:
        X, sr = librosa.load(src_path)
        sr = int(sr)
        feature = get_feature_func(X, sr)
        print('[INFO] Saving Cache in {} ...'.format(des_path))
        des_par = osp.abspath(osp.join(des_path, osp.pardir))
        if not osp.exists(des_par):
            os.makedirs(des_par)
    except Exception as e:
        print("[ERROR] Unknown error happened when dealing with {}".format(src_path))
        # print(e)
        return -1
    np.save(des_path, feature)
    return 0
def adjust_volume(in_fp):
    def adjust(volume):
        audio_p = audio + volume
        fn_p = fn + "_" + str(volume) + "db" + ".wav"
        fd = audio_p.export(path.join(out_dir, str(volume) + 'db', path.split(in_dir)[-1], fn_p),
                            format=format)

    in_dir, fn = path.split(in_fp)
    fn, file_ext = path.splitext(fn)
    file_ext = file_ext.lower()
    format = file_ext.replace('.', '')

    # audio = None
    y, sr = librosa.load(in_fp, sr=44100)
    tmp_in_fp = "tmp/" + fn + "_tmp.wav"
    librosa.output.write_wav(tmp_in_fp, y, sr, norm=False)
    format = "wav"
    audio = aseg.from_file(tmp_in_fp, format)
    os.remove(tmp_in_fp)

    if audio != None:
        for v in volume_list:
            adjust(v)
def log_scale_melspectrogram(path, plot=False):
    signal, sr = lb.load(path, sr=Fs)
    n_sample = signal.shape[0]
    n_sample_fit = int(DURA * Fs)

    if n_sample < n_sample_fit:
        signal = np.hstack((signal, np.zeros((int(DURA * Fs) - n_sample,))))
    elif n_sample > n_sample_fit:
        signal = signal[(n_sample - n_sample_fit) // 2:(n_sample + n_sample_fit) // 2]

    melspect = lb.logamplitude(lb.feature.melspectrogram(y=signal, sr=Fs, hop_length=N_OVERLAP,
                                                         n_fft=N_FFT, n_mels=N_MELS) ** 2,
                               ref_power=1.0)

    if plot:
        melspect = melspect[np.newaxis, :]
        misc.imshow(melspect.reshape((melspect.shape[1], melspect.shape[2])))
        print(melspect.shape)

    return melspect
def read_file_pair(filename_pair, mono=True):
    """
    given a pair of file names, read in both waveforms and upsample (through
    librosa's default interpolation) the downsampled waveform

    assumes the file name pair is of the form ("original", "downsampled")

    mono selects whether to read in mono or stereo formatted waveforms

    returns a pair of numpy arrays representing the original and upsampled
    waveform
    """
    channel = 1 if mono else 2
    true_waveform, true_br = librosa.load(filename_pair[0], sr=None, mono=mono)
    ds_waveform, _ = librosa.load(filename_pair[1], sr=true_br, mono=mono)
    # truth, example
    return true_waveform.reshape((-1, channel)), \
        ds_waveform.reshape((-1, channel))
def extract(filename, fft_size=FFT_SIZE, dtype=np.float32):
    ''' Basic (WORLD) feature extraction '''
    x, _ = librosa.load(filename, sr=args.fs, mono=True, dtype=np.float64)
    features = wav2pw(x, args.fs, fft_size=fft_size)
    ap = features['ap']
    f0 = features['f0'].reshape([-1, 1])
    sp = features['sp']
    en = np.sum(sp + EPSILON, axis=1, keepdims=True)
    sp = np.log10(sp / en)
    return np.concatenate([sp, ap, f0, en], axis=1).astype(dtype)
def extract_features(file_name):
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.array(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=8).T)
    chroma = np.array(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T)
    mel = np.array(librosa.feature.melspectrogram(X, sr=sample_rate).T)
    contrast = np.array(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T)
    tonnetz = np.array(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T)
    return mfccs, chroma, mel, contrast, tonnetz
def _load_from_cache(self):
    try:
        return numpy.load(str(self.spectrogram_cache_file))
    except ValueError:
        log("Recalculating cached file {} because loading failed.".format(self.spectrogram_cache_file))
        return self._calculate_and_save_spectrogram()
def SIGNAL():
    y, sr = librosa.load(librosa.util.example_audio_file(), sr=None)
    return y, sr
def create_marked_audio_file(mark_locations: Union[List[float], np.ndarray],
                             output_path: Opt[str] = None,
                             *,
                             audio_file: Opt[str] = None,
                             duration: float = None):
    if audio_file:
        y, sr = librosa.load(audio_file)
        marked_audio = librosa.core.clicks(times=mark_locations, sr=sr, length=len(y))
        marked_audio = y + marked_audio
    elif duration:
        sr = 22050
        marked_audio = librosa.core.clicks(times=mark_locations, sr=sr, length=int(sr * duration))
    else:
        raise ParameterError("Must provide either audio file or duration.")

    librosa.output.write_wav(path=output_path, y=marked_audio, sr=sr)
    return output_path
def __init__(self, file: str, *, sample_rate: int = 44100):
    """
    Parameters
    ----------
    file
        Audio file to load
    """
    self.file = file
    self.samples, self.sample_rate = librosa.load(file, sr=sample_rate)
    self.duration = librosa.get_duration(y=self.samples, sr=self.sample_rate)
def clip_audio(specs, raw_audio, output):
    # Load the spec data. In clipping audio, we hold the specs fixed.
    spec_filenames = next(os.walk(specs))[2]
    if len(spec_filenames) == 0:
        print("No specs found.")
        return

    for spec_filename in spec_filenames:
        with open(os.path.join(specs, spec_filename)) as f:
            spec = json.load(f)
        youtube_id = spec['audio_source']['youtube_id']
        start_time = spec['audio_source']['start_time']
        end_time = spec['audio_source']['end_time']

        raw_audio_filenames = glob.glob(os.path.join(raw_audio, youtube_id + '.*'))
        if len(raw_audio_filenames) == 0:
            # No audio file found, skip.
            continue
        raw_audio_filename = raw_audio_filenames[0]
        raw_audio_extension = os.path.splitext(raw_audio_filename)[1]

        clip_filename = os.path.join(
            output,
            CLIP_NAME_PATTERN.format(youtube_id, start_time, end_time) + raw_audio_extension)

        # Call ffmpeg to output the trimmed clip.
        os.makedirs(os.path.dirname(clip_filename), exist_ok=True)
        call1 = ['ffmpeg', '-loglevel', 'error', '-n',
                 '-ss', str(start_time),
                 '-t', str(end_time - start_time),
                 '-i', raw_audio_filename]
        if raw_audio_extension == 'ogg':
            call2 = ['-codec:a', 'libvorbis', '-strict', 'experimental']
        else:
            call2 = []
        call3 = [clip_filename]
        process = subprocess.run(call1 + call2 + call3)
        if process.returncode != 0:
            print("Error: {} encountered by {}".format(process.returncode, clip_filename))
        else:
            print(clip_filename)
def test_dtw_aligner():
    x, fs = librosa.load(example_audio_file(), sr=None)
    assert fs == 16000
    x_fast = librosa.effects.time_stretch(x, 2.0)

    X = _get_mcep(x, fs)
    Y = _get_mcep(x_fast, fs)

    D = X.shape[-1]

    # Create padded pair
    X, Y = adjast_frame_lengths(X, Y, divisible_by=2)

    # Add utterance axis
    X = X.reshape(1, -1, D)
    Y = Y.reshape(1, -1, D)

    X_aligned, Y_aligned = DTWAligner().transform((X, Y))
    assert X_aligned.shape == Y_aligned.shape
    assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)

    X_aligned, Y_aligned = IterativeDTWAligner(
        n_iter=2, max_iter_gmm=10, n_components_gmm=2).transform((X, Y))
    assert X_aligned.shape == Y_aligned.shape
    assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)

    # Custom dist function
    from nnmnkwii.metrics import melcd
    X_aligned, Y_aligned = DTWAligner(dist=melcd).transform((X, Y))
    assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
def mfcc_batch_generator(batch_size=10, source=Source.DIGIT_WAVES, target=Target.digits):
    maybe_download(source, DATA_DIR)
    if target == Target.speaker:
        speakers = get_speakers()
    batch_features = []
    labels = []
    files = os.listdir(path)
    while True:
        print("loaded batch of %d files" % len(files))
        shuffle(files)
        for file in files:
            if not file.endswith(".wav"):
                continue
            wave, sr = librosa.load(path + file, mono=True)
            mfcc = librosa.feature.mfcc(wave, sr)
            if target == Target.speaker:
                label = one_hot_from_item(speaker(file), speakers)
            elif target == Target.digits:
                label = dense_to_one_hot(int(file[0]), 10)
            elif target == Target.first_letter:
                label = dense_to_one_hot((ord(file[0]) - 48) % 32, 32)
            elif target == Target.hotword:
                label = one_hot_word(file, pad_to=max_word_length)
            # elif target == Target.word:
            #     label = string_to_int_word(file, pad_to=max_word_length)
            #     label = file  # sparse_labels(file, pad_to=20)  # max_output_length
            else:
                raise Exception("todo : labels for Target!")
            labels.append(label)
            # print(np.array(mfcc).shape)
            mfcc = np.pad(mfcc, ((0, 0), (0, 80 - len(mfcc[0]))), mode='constant', constant_values=0)
            batch_features.append(np.array(mfcc))
            if len(batch_features) >= batch_size:
                # if target == Target.word:  labels = sparse_labels(labels)
                # labels = np.array(labels)
                # print(np.array(batch_features).shape)
                # yield np.array(batch_features), labels
                # print(np.array(labels).shape)
                # why (64,) instead of (64, 15, 32)? OK IFF dim_1 == const (20)
                yield batch_features, labels  # basic_rnn_seq2seq inputs must be a sequence
                batch_features = []  # Reset for next batch
                labels = []

# If you set dynamic_pad=True when calling tf.train.batch the returned batch will be
# automatically padded with 0s. Handy! A lower-level option is to use tf.PaddingFIFOQueue.

# only apply to a subset of all images at one time
def load(self, image_names):
    print("loading %d images" % len(image_names))
    return list(map(self.load_image, image_names))  # python3 map object WTF
def next_batch(self, batch_size, fake_data=False):
    """Return the next `batch_size` examples from this data set."""
    if fake_data:
        fake_image = [1] * width * height
        if self.one_hot:
            fake_label = [1] + [0] * 9
        else:
            fake_label = 0
        return [fake_image for _ in xrange(batch_size)], [
            fake_label for _ in xrange(batch_size)]
    start = self._index_in_epoch
    self._index_in_epoch += batch_size
    if self._index_in_epoch > self._num_examples:
        # Finished epoch
        self._epochs_completed += 1
        # Shuffle the data
        perm = numpy.arange(self._num_examples)
        numpy.random.shuffle(perm)
        # self._images = self._images[perm]
        self._image_names = self._image_names[perm]
        self._labels = self._labels[perm]
        # Start next epoch
        start = 0
        self._index_in_epoch = batch_size
        assert batch_size <= self._num_examples
    end = self._index_in_epoch
    return self.load(self._image_names[start:end]), self._labels[start:end]

# multi-label
def load_audio(path, sample_length=64000, sr=16000):
    """Loading of a wave file.

    Args:
      path: Location of a wave file to load.
      sample_length: The truncated total length of the final wave file.
      sr: Samples per a second.

    Returns:
      out: The audio in samples from -1.0 to 1.0
    """
    audio, _ = librosa.load(path, sr=sr)
    audio = audio[:sample_length]
    return audio
def load_generic_audio(directory, sample_rate): """Generator that yields audio waveforms from the directory.""" def randomize_files(fns): for _ in fns: file_index = random.randint(0, len(fns) - 1) yield fns[file_index] files = find_files(directory) id_reg_exp = re.compile(FILE_PATTERN) print("files length: {}".format(len(files))) randomized_files = randomize_files(files) for filename in randomized_files: ids = id_reg_exp.findall(filename) if not ids: # The file name does not match the pattern containing ids, so # there is no id. category_id = None else: # The file name matches the pattern for containing ids. category_id = int(ids[0][0]) audio, _ = librosa.load(filename, sr=sample_rate, mono=True) # Normalize audio audio = librosa.util.normalize(audio) * 0.8 # Trim the last 5 seconds to account for music rollout audio = audio[:-5 * sample_rate] audio = np.reshape(audio, (-1, 1)) yield audio, filename, category_id
def load_wav(wavfile, sr, mono=True):
    audio, _ = librosa.load(wavfile, sr=sr, mono=mono)

    # Normalize audio
    audio = librosa.util.normalize(audio) * 0.8

    lc = AudioReader.midi_notes_encoding(audio)
    fn = os.path.abspath(wavfile).strip('.wav')
    fn = "{}_lc_embedding.npy".format(fn)
    with open(fn, 'w') as f:
        np.save(f, lc)
def create_seed(filename, sample_rate, quantization_channels, window_size):
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    quantized = mu_law_encode(audio, quantization_channels)
    cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size),
                        lambda: tf.size(quantized),
                        lambda: tf.constant(window_size))
    return quantized[:cut_index]
def load_lc_embedding(lc_embedding):
    with open(lc_embedding, 'r') as f:
        return np.load(f)
def read_wav_file(file):
    r"""
    Loads wav files from disk and resamples to 22050 Hz
    The output is shaped as [timesteps, 1]

    Parameters
    ----------
    file

    Returns
    -------
    """
    import librosa
    data, sr = librosa.load(file)
    return np.expand_dims(data, axis=-1)
def load_sound_files(file_paths):
    raw_sounds = []
    for fp in file_paths:
        X, sr = librosa.load(fp)
        raw_sounds.append(X)
    return raw_sounds
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
def norm_audio(self):
    '''Normalize the audio files used before training, using an independent script'''
    for file in self.audiofiles:
        audio, sr = librosa.load(file, sr=16000)
        div_fac = 1 / np.max(np.abs(audio)) / 3.0
        audio = audio * div_fac
        librosa.output.write_wav(file, audio, sr)
    for file in self.noisefiles:
        audio, sr = librosa.load(file, sr=16000)
        div_fac = 1 / np.max(np.abs(audio)) / 3.0
        audio = audio * div_fac
        librosa.output.write_wav(file, audio, sr)
def callback(recognizer, audio):
    try:
        sentence = recognizer.recognize_google(audio, language=language)
        wave_file_name = "train.wav"
        wav_file = open(wave_file_name, "wb")
        wav_file.write(audio.get_wav_data())
        wav_file.close()
        wave, sample_rate = librosa.load(wave_file_name, mono=True, sr=None)
        wave = wave[::3]
        save_recording(wave_file_name, wave, sentence, CSV_BIG_ONE)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))
def load_wav_files(files):
    wav_files = []
    for i, f in enumerate(files):
        print(i, f)
        wav_files += [librosa.load(f, sr=SAMPLINGRATE)[0]]
    return wav_files
def get_spectrograms(sound_file):
    '''Extracts melspectrogram and log magnitude from given `sound_file`.

    Args:
      sound_file: A string. Full path of a sound file.

    Returns:
      Transposed S: A 2d array. A transposed melspectrogram with shape of (T, n_mels)
      Transposed magnitude: A 2d array. Has shape of (T, 1+hp.n_fft//2)
    '''
    # Loading sound file
    y, sr = librosa.load(sound_file, sr=None)  # or set sr to hp.sr.

    # stft. D: (1+n_fft//2, T)
    D = librosa.stft(y=y,
                     n_fft=hp.n_fft,
                     hop_length=hp.hop_length,
                     win_length=hp.win_length)

    # magnitude spectrogram
    magnitude = np.abs(D)  # (1+n_fft/2, T)

    # power spectrogram
    power = magnitude ** 2  # (1+n_fft/2, T)

    # mel spectrogram
    S = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels)  # (n_mels, T)

    return np.transpose(S.astype(np.float32)), np.transpose(magnitude.astype(np.float32))  # (T, n_mels), (T, 1+n_fft/2)
def test_load_sound(self):
    s1 = Sound.from_file(self.audio_file)
    y, sr = librosa.load(self.audio_file)
    s2 = Sound(y, sr)
    self.assertTrue(numpy.all(s1.y == s2.y))
    s3 = Sound(numpy.random.rand(random.randint(1, 100000)),
               random.choice((88200, 44100, 22050, 11025)))
def from_file(cls, filename, sr=22050):
    """ Loads an audiofile, uses sr=22050 by default. """
    y, sr = librosa.load(filename, sr=sr)
    return cls(y, sr)

# Chunk iterator
def parse_wav(filename, n_mfcc=40):
    '''
    Parses a single wav file into MFCC's and sample rate.

    Arguments:
        filename - Name of input wav file.
        n_mfcc - Number of coefficients to use.

    Returns:
        A tuple with a numpy array with cepstrum coefficients, and sample rate.

    Raises:
    '''
    song_data = np.array([])
    sample_rate = -1
    if filename[-4:] == '.wav':
        try:
            y_data, sample_rate = librosa.load(filename)
            # will need to experiment with different values for n_mfcc
            song_data = librosa.feature.mfcc(y=y_data, sr=sample_rate, n_mfcc=n_mfcc)
        except:
            sys.exit(1)
    return (song_data, sample_rate)
def compute_spectrograms(filename):
    out_rate = 22050

    frames, rate = librosa.load(filename, sr=out_rate, mono=True)
    if len(frames) < out_rate:
        # less than 1 second - can't process
        raise Exception("Audio duration is too short")

    normalized_audio = _normalize(frames)

    melspectr = librosa.feature.melspectrogram(y=normalized_audio, sr=out_rate,
                                               n_mels=N_MEL_BANDS, fmax=out_rate / 2)
    logmelspectr = librosa.logamplitude(melspectr ** 2, ref_power=1.0)

    # now going through spectrogram with the stride of the segment duration
    for start_idx in range(0, logmelspectr.shape[1] - SEGMENT_DUR + 1, SEGMENT_DUR):
        yield logmelspectr[:, start_idx:start_idx + SEGMENT_DUR]
def get_feature_aqibsaeed_1(X, sr, au_path=None):
    """
    http://aqibsaeed.github.io/2016-09-03-urban-sound-classification-part-1/
    """
    import librosa
    if au_path is not None:
        X, sr = librosa.load(au_path)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sr, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sr).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sr).T, axis=0)
    feature = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
    return feature
def __init__(self, cache=None, **kwargs):
    super(GTZAN, self).__init__(**kwargs)
    if kwargs.get('conf') is not None:
        conf = kwargs['conf']
        cache = conf.get('cache', None)
    data_set_path = osp.join(DEFAULT_IMAGEST_BASE, self.data_set)
    self.data_set_path = data_set_path
    self.cache = cache
    X, y = parse_anno_file(data_set_path)
    if cache == 'raw':
        import librosa
        from tqdm import trange
        X_new = np.zeros((len(X), 1, 661500, 1))
        for i in trange(len(X)):
            x, _ = librosa.load(osp.join(DEFAULT_DATA_BASE, X[i]))
            x_len = min(661500, len(x))
            X_new[i, :, :x_len, 0] = x[:x_len]
    if cache is not None and cache != 'raw':
        X = self.load_cache_X(X, cache)
        if cache == 'mfcc':
            X_new = np.zeros((len(X), X[0].shape[0], 1280, 1))
            for i, x in enumerate(X):
                x_len = min(x.shape[1], 1280)
                X_new[i, :, :x_len, 0] = x[:, :x_len]
            X = X_new

    # layout_X
    if self.layout_x == 'rel_path':
        self.X = X
    else:
        self.X = self.init_layout_X(X)

    # layout_y
    self.y = self.init_layout_y(y)