def getdata(self):
    """Load every song below ``self.file_path`` and build the training arrays.

    For each key of ``self.genres`` the folder ``self.file_path + key`` is
    walked; each file is decoded with librosa and a mel spectrogram of its
    first ``self.song_samples`` samples is computed.

    Returns:
        A tuple ``(songs, genres)``. ``songs`` is ``np.array`` of the
        transposed spectrograms (at most 1280 frames each); ``genres`` is the
        one-hot encoding produced by ``keras.utils.to_categorical``.
    """
    song_data = []
    genre_data = []

    # presumably the values of self.genres are integer class ids, since they
    # are fed to to_categorical -- verify against the caller.
    for genre, genre_id in self.genres.items():
        for root, _subdirs, files in os.walk(self.file_path + genre):
            for file in files:
                # Join with `root` so files inside nested folders resolve too.
                file_name = os.path.join(root, file)
                signal, sr = librosa.load(file_name)

                # `y` is passed by keyword: recent librosa releases reject a
                # positional signal argument.
                melspec = librosa.feature.melspectrogram(
                    y=signal[:self.song_samples], sr=sr,
                    n_fft=self.n_fft, hop_length=self.hop_length).T[:1280,]

                # Append the result to the data structure
                song_data.append(melspec)
                genre_data.append(genre_id)
    return np.array(song_data), keras.utils.to_categorical(genre_data, len(self.genres))
def __init__(self,
             audio_file: Path,
             id: Optional[str] = None,
             sample_rate_to_convert_to: int = 16000,
             label: Optional[str] = "nolabel",
             fourier_window_length: int = 512,
             hop_length: int = 128,
             mel_frequency_count: int = 128,
             label_with_tags: Optional[str] = None,
             positional_label: Optional[PositionalLabel] = None):
    """Create an example whose raw audio is read lazily from ``audio_file``.

    ``id`` defaults to the file name without its extension. The remaining
    arguments are forwarded unchanged to the base-class initializer; the raw
    audio is supplied as a callable that loads the file with librosa at
    ``self.sample_rate`` when invoked.
    """
    # The default values for hop_length and fourier_window_length are powers of 2 near the values specified in the wave2letter paper.

    if id is None:
        id = name_without_extension(audio_file)

    self.audio_file = audio_file

    # NOTE(review): the head of this call was missing from the source; it is
    # reconstructed as the base-class initializer -- confirm.
    super().__init__(
        id=id, get_raw_audio=lambda: librosa.load(str(self.audio_file), sr=self.sample_rate)[0],
        label=label, sample_rate=sample_rate_to_convert_to,
        fourier_window_length=fourier_window_length, hop_length=hop_length, mel_frequency_count=mel_frequency_count,
        label_with_tags=label_with_tags, positional_label=positional_label)
def __init__(self, images, labels, fake_data=False, one_hot=False, load=False):
    """Construct a DataSet. one_hot arg is used only if fake_data is true.

    Args:
        images: Sequence of image names; stored as a numpy array.
        labels: Labels, one per image.
        fake_data: When true, pretend to hold 10000 examples.
        one_hot: Stored on the instance only when ``fake_data`` is true.
        load: When true, load the images immediately instead of on demand.
    """
    if fake_data:
        self._num_examples = 10000
        self.one_hot = one_hot
    else:
        num = len(images)
        # Report lengths rather than `.shape`, which plain lists do not have.
        assert num == len(labels), ('len(images): %d len(labels): %d' % (num, len(labels)))
        print("len(images) %d" % num)
        self._num_examples = num
    self._image_names = numpy.array(images)
    self._labels = labels
    self._epochs_completed = 0
    self._index_in_epoch = 0
    if load:  # Otherwise loaded on demand
        # NOTE(review): the body of this branch was missing from the source;
        # reconstructed as an eager load through self.load -- confirm.
        self._images = self.load(self._image_names)
def read_data_sets(train_dir,source_data=Source.NUMBER_IMAGES, fake_data=False, one_hot=True):
    """Build the train/test ``DataSet`` objects.

    Args:
        train_dir: Directory handed to ``maybe_download``.
        source_data: Which data source to download.
        fake_data: When true, return empty fake train/validation/test sets.
        one_hot: Forwarded to the label extraction and to fake data sets.

    Returns:
        An object with ``train`` and ``test`` attributes (plus ``validation``
        in the fake-data case only; no validation split is built otherwise).
    """
    class DataSets(object):
        """Plain attribute container for the data splits."""

    data_sets = DataSets()
    if fake_data:
        data_sets.train = DataSet([], [], fake_data=True, one_hot=one_hot)
        data_sets.validation = DataSet([], [], fake_data=True, one_hot=one_hot)
        data_sets.test = DataSet([], [], fake_data=True, one_hot=one_hot)
        return data_sets
    # Called for its download side effect; the returned path is not needed.
    maybe_download(source_data, train_dir)
    train_images = extract_images(TRAIN_INDEX,train=True)
    train_labels = extract_labels(TRAIN_INDEX,train=True, one_hot=one_hot)
    test_images = extract_images(TEST_INDEX,train=False)
    test_labels = extract_labels(TEST_INDEX,train=False, one_hot=one_hot)
    # Training images are loaded on demand, test images eagerly.
    data_sets.train = DataSet(train_images, train_labels , load=False)
    data_sets.test = DataSet(test_images, test_labels, load=True)
    return data_sets
def load_audio(audio_filename, sample_rate):
  """Loads an audio file.

  Args:
    audio_filename: File path to load.
    sample_rate: The number of samples per second at which the audio will be
        returned. Resampling will be performed if necessary.

  Returns:
    A numpy array of audio samples, single-channel (mono) and sampled at the
    specified rate, in float32 format.

  Raises:
    AudioIOReadException: If librosa is unable to load the audio data.
  """
  try:
    y, unused_sr = librosa.load(audio_filename, sr=sample_rate, mono=True)
  except Exception as e:  # pylint: disable=broad-except
    # Any loader failure is surfaced as the single project exception type.
    raise AudioIOReadException(e)
  return y
def get_seeds(self, audio_filepath):
    """Get the seeds file to pass to the HLL tracker.

    Parameters
    ----------
    audio_filepath : str
        Path to audio file.

    Returns
    -------
    seeds_fpath : str
        Path to the seeds output file.

    """
    y, sr = librosa.load(audio_filepath, sr=44100)
    y_harmonic = librosa.effects.harmonic(y)
    cqt, samples, freqs = self._compute_cqt(y_harmonic, sr)
    seeds = self._pick_seeds_cqt(cqt, freqs, samples)

    seeds_fpath = tmp.mktemp('.csv')
    with open(seeds_fpath, 'w') as fhandle:
        writer = csv.writer(fhandle, delimiter=',')
        # NOTE(review): the statement that writes `seeds` through `writer`
        # is missing from the source, so the file is created empty. The row
        # layout cannot be determined from this block -- restore it from the
        # upstream project.
    return seeds_fpath
def compute_spectrograms(filename):
    """Yield fixed-width log-mel spectrogram segments of one audio file.

    The file is loaded as mono at 12000 Hz. Segments are ``SEGMENT_DUR``
    columns wide and do not overlap; a trailing partial segment is dropped.

    Raises:
        Exception: If the audio holds fewer than three seconds of samples.
            Because this is a generator, the check runs on first iteration.
    """
    out_rate = 12000
    N_FFT = 512
    HOP_LEN = 256

    frames, rate = librosa.load(filename, sr=out_rate, mono=True)
    if len(frames) < out_rate*3:
        # if less than 3 seconds - can't process
        raise Exception("Audio duration is too short")

    logam = librosa.logamplitude
    melgram = librosa.feature.melspectrogram
    # The closing argument of this call was lost in the source; ref_power=1.0
    # matches the other log-amplitude calls in this file.
    x = logam(melgram(y=frames, sr=out_rate, hop_length=HOP_LEN,
                      n_fft=N_FFT, n_mels=N_MEL_BANDS) ** 2,
              ref_power=1.0)

    # now going through spectrogram with the stride of the segment duration
    for start_idx in range(0, x.shape[1] - SEGMENT_DUR + 1, SEGMENT_DUR):
        yield x[:, start_idx:start_idx + SEGMENT_DUR]
def predict_on_long_clips():
    """Load the saved model and perform inference/prediction on features obtained from inputs.

    Splits the audio into chunks of ``max_audio_length`` samples (described
    as 10-second chunks) and predicts on those chunks.
    """
    # NOTE(review): the statement that reads `filenames` from `fh` is missing
    # from this block, so `filenames` is used before it is assigned.
    with open(FILENAMES,"r") as fh:
        filenames=filenames[:5] #[:5] is for quickly verifying if things work
        filenames = [DATASET_LOCATION+f for f in filenames]

    # Restore the saved graph and its latest checkpoint into a new session.
    session = tf.Session()
    saver = tf.train.import_meta_graph(IMPORT_META_GRAPH)
    saver.restore(session, tf.train.latest_checkpoint(IMPORT_LATEST_CHECKPOINT))

    # Maps each file name to the features extracted from its chunks.
    test_x = {}
    for f in filenames:
        s, sr = librosa.load(f)
        # NOTE(review): `/` yields a float on Python 3, which range() below
        # rejects; this block is written for Python 2 (see the print statements).
        total_chunks = s.shape[0]/max_audio_length
        # Only whole chunks are kept; a trailing partial chunk is dropped.
        waveforms = [s[max_audio_length*i:max_audio_length*(i+1)] for i in range(total_chunks)]
        test_x[f] = extract_features_from_waveforms(waveforms)

        print "FILENAME: ", f
        # NOTE(review): the next line is truncated in the source (the call
        # that produces `predictions` is missing) and is not valid syntax.
        predictions =, 1), feed_dict={X: test_x[f]})
        print [possible_categories[p] for p in predictions]
def loadFile(self, fname):
    '''
    Return the transposed MFCC matrix of a sound file.

    fname:      filename of the sound file we want to load

    When ``self.cached`` is true the features are stored next to the sound
    file as ``<fname>-mfcc.npy`` and reused on later calls.
    '''
    if self.verbose: print('Loading %s' % fname)

    if self.cached:
        cache_name = fname + '-mfcc.npy'
        if not os.path.exists(cache_name):
            y, sr = librosa.load(fname)
            data = mfcc(y=y, sr=sr).T
            np.save(cache_name, data)
        else:
            data = np.load(cache_name)
    else:
        y, sr = librosa.load(fname)
        # TODO: Add ability to filter by seconds/duration
        # seconds = y.size/sr
        data = mfcc(y=y, sr=sr).T

    return data
def get_mfccs_and_deltas(wav_pathname, n_mfcc=13, n_fft=2048, freq_min=100, freq_max=16000):
    """Return per-frame MFCC, delta and delta-delta features of a wav file.

    The file is loaded at 44100 Hz and analysed with non-overlapping frames
    (``hop_length == n_fft``).

    Returns:
        A list with one entry per frame. Each entry is a flat list holding
        the MFCCs without the 0th coefficient, then the deltas, then the
        second-order deltas. An empty list is returned for empty audio.
    """
    sample_array, sample_rate = librosa.load(wav_pathname, sr=44100)
    if len(sample_array) == 0:
        return []
    # Keyword arguments: recent librosa releases reject positional y / sr.
    mfcc = librosa.feature.mfcc(y=sample_array, sr=sample_rate, n_fft=n_fft, hop_length=n_fft, n_mfcc=n_mfcc, fmin=freq_min, fmax=freq_max)
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # Transpose so that iteration runs over frames rather than coefficients.
    return [
        list(frame[1:]) + list(frame_delta) + list(frame_delta2)
        for frame, frame_delta, frame_delta2 in zip(mfcc.T, delta.T, delta2.T)
    ]
def recognise_mfcc(filePath,outputDir,outputName,debug):
    """Compute MFCCs of a file and of its harmonic and percussive parts.

    The three transposed MFCC matrices and the output base path are computed
    but nothing is written or returned by the code in this block.
    """
    print("start decompose harmonic/percussive and extract mfcc {0}".format(filePath))
    y, sr = librosa.load(filePath)
    mfcc = np.transpose(librosa.feature.mfcc(y=y, sr=sr))
    basePath = outputDir + outputName

    # Same margin for both components of the separation.
    harmonic_sep = percussive_sep = 3.0
    h, p = librosa.effects.hpss(y, margin=(harmonic_sep, percussive_sep))
    hmfcc = np.transpose(librosa.feature.mfcc(y=h, sr=sr))
    pmfcc = np.transpose(librosa.feature.mfcc(y=p, sr=sr))

# extract rhythm pattern with rp_extract
def load_generic_audio(directory, sample_rate):
    '''Generator that yields audio waveforms from the directory.

    Yields tuples ``(audio, filename, category_id)``. ``audio`` is the mono
    waveform reshaped to a single column; ``category_id`` is the integer
    parsed from the first ``FILE_PATTERN`` match in the file name, or
    ``None`` when the name does not match.
    '''
    files = find_files(directory)
    id_reg_exp = re.compile(FILE_PATTERN)
    print("files length: {}".format(len(files)))
    randomized_files = randomize_files(files)
    for filename in randomized_files:
        ids = id_reg_exp.findall(filename)
        if not ids:
            # The file name does not match the pattern containing ids, so
            # there is no id.
            category_id = None
        else:
            # The file name matches the pattern for containing ids.
            category_id = int(ids[0][0])
        audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
        audio = audio.reshape(-1, 1)
        yield audio, filename, category_id
def get_audio_analysis(song_url):
    """Download a song and derive tempo, pitch and fingerprint figures.

    The song is saved to ``current.mp3`` in the working directory,
    overwriting any previous download.

    Returns:
        ``(tempo, average_pitch, harmonic, percussive, data)`` where the
        first four are floats and ``data`` is a dict with the keys
        ``'sound_wave'`` and ``'pitch'``. Five ``None`` values are returned
        when ``song_url`` is ``None``.
    """
    if song_url is None:
        return None, None, None, None, None
    urlretrieve(song_url, "current.mp3")
    y, sr = librosa.load("./current.mp3")

    # Tempo = beats/minute
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)

    # pitch = Frequency
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr,
                                           fmax=1000, hop_length=1000)

    pitches, magnitudes = extract_max(pitches, magnitudes, pitches.shape)
    # Zero out near-silent samples, then trim the silence from both ends.
    y[abs(y) < 10**-2] = 0
    y = np.trim_zeros(y)

    # Named `analysis` locally so the standard `json` module is not shadowed.
    analysis = {
        'sound_wave': np.array(y[:len(pitches)]).tolist(),
        'pitch': pitches,
    }
    y_harm, y_per = librosa.effects.hpss(y)
    harm, perc = audio_fingerprint(y_harm), audio_fingerprint(y_per)
    pitch_ave = np.average(pitches)
    return float(tempo), float(pitch_ave), float(harm), float(perc), analysis
def main():
    """Create ``train_data_num`` random mixtures of the two source recordings.

    Each mixture combines one randomly positioned segment of
    ``sav_n_secs`` seconds from each recording and is handed to ``mix``
    together with the output directory ``mix``.
    """
    outdir = 'mix'
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    audio_total1, sr = librosa.load('./cao.wav', sr=sample_rate, mono=True)
    audio_total2, sr = librosa.load('./huang.wav', sr=sample_rate, mono=True)

    seglen = int(sav_n_secs * sr)

    # Largest start index that still leaves room for a full segment.
    len1 = audio_total1.shape[0] - seglen
    len2 = audio_total2.shape[0] - seglen

    for i in range(train_data_num):
        if i % 100 == 0:
            # NOTE(review): the body of this branch was missing from the
            # source; a progress print is assumed -- confirm.
            print(i)
        idx1 = random.randint(0, len1)
        idx2 = random.randint(0, len2)
        mix(audio_total1[idx1:idx1+seglen], audio_total2[idx2:idx2+seglen], sample_rate, sav_n_secs,outdir,i)
def save_cache(src_path, des_path, get_feature_func):
    """Compute a feature for one audio file and cache it as ``.npy``.

    Args:
        src_path: Audio file to load.
        des_path: Target path; its extension is replaced with ``.npy``.
        get_feature_func: Callable ``(samples, sample_rate) -> feature``.

    Returns:
        ``0`` on success, ``-1`` if loading, feature extraction or saving
        raised an exception.
    """
    import os

    des_path = osp.splitext(des_path)[0] + '.npy'
    try:
        X, sr = librosa.load(src_path)
        feature = get_feature_func(X, sr)
        print('[INFO] Saving Cache in {} ...'.format(des_path))
        des_par = osp.abspath(osp.join(des_path, osp.pardir))
        if not osp.exists(des_par):
            os.makedirs(des_par)
        np.save(des_path, feature)
    except Exception:
        # Best-effort caching: report the failing file and signal it with -1.
        print("[ERROR] Unkown error happend when dealing with{}".format(src_path))
        return -1
    return 0
def adjust_volume(in_fp):
    """Export one volume-shifted copy of ``in_fp`` per entry of ``volume_list``.

    The input is first re-encoded at 44100 Hz to ``tmp/<name>_tmp.wav``.
    Each copy is written as wav to
    ``<out_dir>/<volume>db/<input folder name>/<name>_<volume>db.wav``.
    """
    def adjust(volume):
        # presumably `audio + volume` applies a gain in dB (file names use a
        # "db" suffix) -- verify against the audio segment type.
        audio_p = audio + volume
        fn_p = fn + "_" + str(volume) +"db" + ".wav"
        audio_p.export(path.join(out_dir, str(volume) + 'db', path.split(in_dir)[-1], fn_p), format=fmt)

    in_dir, fn = path.split(in_fp)
    fn, _file_ext = path.splitext(fn)
    y, sr = librosa.load(in_fp, sr=44100)
    tmp_in_fp = "tmp/" + fn + "_tmp.wav"
    librosa.output.write_wav(tmp_in_fp, y, sr, norm=False)
    # Everything downstream works on the re-encoded wav file.
    fmt = "wav"
    audio = aseg.from_file(tmp_in_fp, fmt)

    if audio is not None:
        for v in volume_list:
            adjust(v)
def log_scale_melspectrogram(path, plot=False):
    """Return the log-amplitude mel spectrogram of a fixed-length excerpt.

    The signal is loaded at ``Fs`` and forced to ``DURA * Fs`` samples:
    shorter signals are zero-padded at the end, longer ones are cropped
    around their centre.

    Args:
        path: Audio file to load.
        plot: When true, a leading axis is added to the returned array.
    """
    signal, sr = lb.load(path, sr=Fs)
    n_sample = signal.shape[0]
    n_sample_fit = int(DURA*Fs)

    if n_sample < n_sample_fit:
        signal = np.hstack((signal, np.zeros((n_sample_fit - n_sample,))))
    elif n_sample > n_sample_fit:
        # Floor division keeps the slice bounds integral on Python 3.
        signal = signal[(n_sample-n_sample_fit)//2:(n_sample+n_sample_fit)//2]

    melspect = lb.logamplitude(lb.feature.melspectrogram(y=signal, sr=Fs, hop_length=N_OVERLAP, n_fft=N_FFT, n_mels=N_MELS)**2, ref_power=1.0)

    if plot:
        melspect = melspect[np.newaxis, :]

    return melspect
def read_file_pair(filename_pair, mono=True):
    """
    given a pair of file names, read in both waveforms and upsample (through
    librosa's default interpolation) the downsampled waveform
    assumes the file name pair is of the form ("original", "downsampled")
    mono selects whether to read in mono or stereo formatted waveforms

    returns a pair of numpy arrays representing the original and upsampled
    waveforms, each reshaped to (-1, channel)
    """
    channel = 1 if mono else 2
    # sr=None keeps the native rate of the original; the downsampled file is
    # then resampled to that same rate.
    true_waveform, true_br = librosa.load(filename_pair[0], sr=None,
                                          mono=mono)
    ds_waveform, _ = librosa.load(filename_pair[1], sr=true_br, mono=mono)
    # NOTE(review): for stereo input the reshape below does not transpose
    # what the loader returns -- confirm the intended channel layout.
    # truth, example
    return true_waveform.reshape((-1, channel)), \
        ds_waveform.reshape((-1, channel))
def extract(filename, fft_size=FFT_SIZE, dtype=np.float32):
    ''' Basic (WORLD) feature extraction

    Loads `filename` as mono at `args.fs` and returns one row per frame with
    the columns [normalized log10 sp, ap, f0, energy], cast to `dtype`.
    '''
    waveform, _ = librosa.load(filename, sr=args.fs, mono=True, dtype=np.float64)
    world = wav2pw(waveform, args.fs, fft_size=fft_size)

    aperiodicity = world['ap']
    pitch = world['f0'].reshape([-1, 1])
    spectrum = world['sp']

    # Per-frame energy; the spectrum is normalized by it before the log.
    energy = np.sum(spectrum + EPSILON, axis=1, keepdims=True)
    log_spectrum = np.log10(spectrum / energy)

    stacked = np.concatenate([log_spectrum, aperiodicity, pitch, energy], axis=1)
    return stacked.astype(dtype)
def extract_features(file_name):
    """Return per-frame audio features of one file.

    Returns:
        ``(mfccs, chroma, mel, contrast, tonnetz)``; each is the transposed
        output of the corresponding librosa feature function. MFCCs use 8
        coefficients.
    """
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.array(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=8).T)
    chroma = np.array(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T)
    # `y` is passed by keyword: recent librosa releases reject a positional
    # signal argument.
    mel = np.array(librosa.feature.melspectrogram(y=X, sr=sample_rate).T)
    contrast = np.array(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T)
    tonnetz = np.array(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T)
    return mfccs,chroma,mel,contrast,tonnetz
def _load_from_cache(self):
    """Return the cached spectrogram, recalculating it if loading fails.

    A ``ValueError`` from ``numpy.load`` is logged and answered by
    recalculating and saving the spectrogram again.
    """
    try:
        return numpy.load(str(self.spectrogram_cache_file))
    except ValueError:
        log("Recalculating cached file {} because loading failed.".format(self.spectrogram_cache_file))
        return self._calculate_and_save_spectrogram()
# Project: crema | Author: bmcfee | project source | file source
    y, sr = librosa.load(librosa.util.example_audio_file(),
    return y, sr
# Project: crema | Author: bmcfee | project source | file source
    y, sr = librosa.load(librosa.util.example_audio_file(),
    return y, sr
def create_marked_audio_file(mark_locations: Union[List[float], np.ndarray], output_path: Opt[str] = None, *,
                             audio_file: Opt[str] = None, duration: float = None):
    """Write a WAV file with a click at each of ``mark_locations``.

    Args:
        mark_locations: Click positions, passed as ``times`` to librosa.
        output_path: Where the WAV file is written.
        audio_file: If given, the clicks are mixed onto this recording.
        duration: Used when no ``audio_file`` is given; the clicks are
            rendered on their own over this many seconds at 22050 Hz.

    Returns:
        ``output_path``.

    Raises:
        ParameterError: If neither ``audio_file`` nor ``duration`` is given.
    """
    if audio_file:
        y, sr = librosa.load(audio_file)
        marked_audio = librosa.core.clicks(times=mark_locations, sr=sr, length=len(y))
        marked_audio = y + marked_audio
    elif duration:
        sr = 22050
        marked_audio = librosa.core.clicks(times=mark_locations, sr=sr, length=int(sr * duration))
    else:
        raise ParameterError("Must provide either audio file or duration.")

    librosa.output.write_wav(path=output_path, y=marked_audio, sr=sr)

    return output_path
def __init__(self, file: str, *, sample_rate: int = 44100):
    """Load an audio file and record its samples, rate and duration.

    Parameters
    ----------
    file
        Audio file to load
    sample_rate
        Rate passed to librosa as ``sr`` when loading
    """
    self.file = file
    self.samples, self.sample_rate = librosa.load(file, sr=sample_rate)
    self.duration = librosa.get_duration(y=self.samples, sr=self.sample_rate)
# Project: aurora | Author: caretcaret | project source | file source
  # Load the spec data. In clipping audio, we hold the specs fixed.
  spec_filenames = next(os.walk(specs))[2]
  if len(spec_filenames) == 0:
    print("No specs found.")
  for spec_filename in spec_filenames:
    with open(os.path.join(specs, spec_filename)) as f:
      spec = json.load(f)
    youtube_id = spec['audio_source']['youtube_id']
    start_time = spec['audio_source']['start_time']
    end_time = spec['audio_source']['end_time']

    raw_audio_filenames = glob.glob(os.path.join(raw_audio, youtube_id + '.*'))
    if len(raw_audio_filenames) == 0:
      # No audio file found, skip.
    raw_audio_filename = raw_audio_filenames[0]
    raw_audio_extension = os.path.splitext(raw_audio_filename)[1]
    clip_filename = os.path.join(
        output, CLIP_NAME_PATTERN.format(youtube_id, start_time, end_time) +

    # Call ffmpeg to output the trimmed clip.
    os.makedirs(os.path.dirname(clip_filename), exist_ok=True)
    call1 = ['ffmpeg', '-loglevel', 'error', '-n',
             '-ss', str(start_time), '-t', str(end_time - start_time),
             '-i', raw_audio_filename]
    if raw_audio_extension == 'ogg':
      call2 = ['-codec:a', 'libvorbis', '-strict', 'experimental']
      call2 = []
    call3 = [clip_filename]
    process = + call2 + call3)
    if process.returncode != 0:
      print("Error: {} encountered by {}".format(
          process.returncode, clip_filename))
def test_dtw_aligner():
    """Check that DTW alignment brings a clip and its sped-up copy closer.

    Aligned features must share a shape and lie closer together than the
    padded, unaligned features, for the plain, the iterative and the
    custom-distance aligners.
    """
    x, fs = librosa.load(example_audio_file(), sr=None)
    assert fs == 16000
    # `rate` is passed by keyword: recent librosa releases reject it as a
    # positional argument.
    x_fast = librosa.effects.time_stretch(x, rate=2.0)

    X = _get_mcep(x, fs)
    Y = _get_mcep(x_fast, fs)

    D = X.shape[-1]

    # Create padded pair
    X, Y = adjast_frame_lengths(X, Y, divisible_by=2)

    # Add utterance axis
    X = X.reshape(1, -1, D)
    Y = Y.reshape(1, -1, D)

    X_aligned, Y_aligned = DTWAligner().transform((X, Y))
    assert X_aligned.shape == Y_aligned.shape
    assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)

    X_aligned, Y_aligned = IterativeDTWAligner(
        n_iter=2, max_iter_gmm=10, n_components_gmm=2).transform((X, Y))
    assert X_aligned.shape == Y_aligned.shape
    assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)

    # Custom dist function
    from nnmnkwii.metrics import melcd
    X_aligned, Y_aligned = DTWAligner(dist=melcd).transform((X, Y))
    assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
def mfcc_batch_generator(batch_size=10, source=Source.DIGIT_WAVES, target=Target.digits):
    """Yield endless batches of MFCC features and labels for the wav files in ``path``.

    Each yielded item is ``(batch_features, labels)``: two lists of length
    ``batch_size``. Every feature matrix is cut or zero-padded to exactly 80
    frames. The label encoding depends on ``target``.

    Raises:
        Exception: If ``target`` has no label encoding defined here.
    """
    maybe_download(source, DATA_DIR)
    if target == Target.speaker: speakers = get_speakers()
    batch_features = []
    labels = []
    files = os.listdir(path)
    while True:
        print("loaded batch of %d files" % len(files))
        for file in files:
            if not file.endswith(".wav"): continue
            wave, sr = librosa.load(path+file, mono=True)
            # Keyword arguments: recent librosa releases reject positional y / sr.
            mfcc = librosa.feature.mfcc(y=wave, sr=sr)
            if target==Target.speaker: label=one_hot_from_item(speaker(file), speakers)
            elif target==Target.digits:  label=dense_to_one_hot(int(file[0]),10)
            elif target==Target.first_letter:  label=dense_to_one_hot((ord(file[0]) - 48) % 32,32)
            elif target == Target.hotword: label = one_hot_word(file, pad_to=max_word_length)  #
            elif target == Target.word: label=string_to_int_word(file, pad_to=max_word_length)
            else: raise Exception("todo : labels for Target!")
            # Cut long clips first so the pad width below is never negative.
            mfcc = mfcc[:, :80]
            mfcc=np.pad(mfcc,((0,0),(0,80-len(mfcc[0]))), mode='constant', constant_values=0)
            # The two appends were missing from the source; without them the
            # batch never fills and the generator never yields.
            labels.append(label)
            batch_features.append(np.array(mfcc))
            if len(batch_features) >= batch_size:
                yield batch_features, labels  # basic_rnn_seq2seq inputs must be a sequence
                batch_features = []  # Reset for next batch
                labels = []

# If you set dynamic_pad=True when calling tf.train.batch the returned batch will be automatically padded with 0s. Handy! A lower-level option is to use tf.PaddingFIFOQueue.
# only apply to a subset of all images at one time
# Project: skill-voice-recognition | Author: TREE-Edu | project source | file source
        print("loading %d images"%len(image_names))
        return list(map(self.load_image,image_names)) # python3 map object WTF
# Project: skill-voice-recognition | Author: TREE-Edu | project source | file source
def next_batch(self, batch_size, fake_data=False):
    """Return the next `batch_size` examples from this data set.

    Returns a pair ``(images, labels)``. With ``fake_data`` the pair holds
    constant placeholder values. Otherwise the images are loaded through
    ``self.load``; when the epoch is exhausted the names and labels are
    reshuffled together and the batch is taken from the start.
    """
    if fake_data:
        fake_image = [1] * width * height
        if self.one_hot:
            fake_label = [1] + [0] * 9
        else:
            fake_label = 0
        return ([fake_image for _ in range(batch_size)],
                [fake_label for _ in range(batch_size)])
    start = self._index_in_epoch
    self._index_in_epoch += batch_size
    if self._index_in_epoch > self._num_examples:
        # Finished epoch
        self._epochs_completed += 1
        # Shuffle the data; one permutation keeps names and labels paired.
        perm = numpy.arange(self._num_examples)
        numpy.random.shuffle(perm)
        self._image_names = self._image_names[perm]
        # asarray: plain-list labels cannot be indexed by a permutation.
        self._labels = numpy.asarray(self._labels)[perm]
        # Start next epoch
        start = 0
        self._index_in_epoch = batch_size
        assert batch_size <= self._num_examples
    end = self._index_in_epoch
    return self.load(self._image_names[start:end]), self._labels[start:end]

# multi-label
def load_audio(path, sample_length=64000, sr=16000):
  """Loading of a wave file.

  Args:
    path: Location of a wave file to load.
    sample_length: The truncated total length of the final wave file.
    sr: Samples per a second.

  Returns:
    out: The audio in samples from -1.0 to 1.0
  """
  audio, _ = librosa.load(path, sr=sr)
  # Keep at most `sample_length` samples; shorter audio is returned as is.
  audio = audio[:sample_length]
  return audio
def load_generic_audio(directory, sample_rate):
    """Generator that yields audio waveforms from the directory.

    Yields tuples ``(audio, filename, category_id)``. ``audio`` is the mono
    waveform, peak-normalized and scaled by 0.8, with its last five seconds
    removed, reshaped to a single column. ``category_id`` is the integer
    parsed from the first ``FILE_PATTERN`` match in the file name, or
    ``None`` when the name does not match.
    """

    def randomize_files(fns):
        # Draws with replacement: one random pick per input file, so a file
        # may repeat and another may be skipped.
        for _ in fns:
            file_index = random.randint(0, len(fns) - 1)
            yield fns[file_index]

    files = find_files(directory)
    id_reg_exp = re.compile(FILE_PATTERN)
    print("files length: {}".format(len(files)))
    randomized_files = randomize_files(files)
    for filename in randomized_files:
        ids = id_reg_exp.findall(filename)
        if not ids:
            # The file name does not match the pattern containing ids, so
            # there is no id.
            category_id = None
        else:
            # The file name matches the pattern for containing ids.
            category_id = int(ids[0][0])
        audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
        # Normalize audio
        audio = librosa.util.normalize(audio) * 0.8
        # Trim the last 5 seconds to account for music rollout
        audio = audio[:-5 * sample_rate]
        audio = np.reshape(audio, (-1, 1))
        yield audio, filename, category_id
def load_wav(wavfile, sr, mono=True):
    """Encode a wav file as MIDI-note conditioning and save it as ``.npy``.

    The audio is loaded at ``sr``, peak-normalized and scaled by 0.8, then
    encoded with ``AudioReader.midi_notes_encoding``. The result is saved as
    ``<absolute path without .wav>_lc_embedding.npy``.
    """
    audio, _ = librosa.load(wavfile, sr=sr, mono=mono)
    # Normalize audio
    audio = librosa.util.normalize(audio) * 0.8
    lc = AudioReader.midi_notes_encoding(audio)

    fn = os.path.abspath(wavfile)
    # Remove only a trailing ".wav". str.strip('.wav') would instead strip
    # any of the characters '.', 'w', 'a', 'v' from both ends of the path.
    if fn.endswith('.wav'):
        fn = fn[:-len('.wav')]
    fn = "{}_lc_embedding.npy".format(fn)
    # NOTE(review): the save call was truncated in the source; reconstructed
    # as a numpy save of `lc`, which matches the `.npy` file name -- confirm.
    np.save(fn, lc)
def create_seed(filename,
                sample_rate,
                quantization_channels,
                window_size):
    """Return the mu-law encoded start of an audio file for use as a seed.

    NOTE(review): every parameter after ``filename`` was lost from the
    signature in the source. The names are taken from their uses in the
    body; order and defaults must be checked against the callers.

    The result holds at most ``window_size`` quantized samples.
    """
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    quantized = mu_law_encode(audio, quantization_channels)
    # Cut at whichever is shorter: the whole clip or one window.
    cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size),
                        lambda: tf.size(quantized),
                        lambda: tf.constant(window_size))

    return quantized[:cut_index]
def load_lc_embedding(lc_embedding):
    """Load an array saved in ``.npy`` format from the path ``lc_embedding``."""
    # Let numpy open the file itself: `.npy` is a binary format, and a
    # text-mode handle cannot be parsed on Python 3.
    return np.load(lc_embedding)
def read_wav_file(file):
    """
    Loads wav files from disk and resamples to 22050 Hz
    The output is shaped as [timesteps, 1]
    """
    import librosa
    # librosa's default target rate is used; the returned rate is not needed.
    data, _ = librosa.load(file)
    return np.expand_dims(data, axis=-1)
def load_sound_files(file_paths):
    """Load each path with librosa and return the waveforms in input order.

    Returns an empty list when ``file_paths`` is empty.
    """
    raw_sounds = []
    for fp in file_paths:
        X, sr = librosa.load(fp)
        # Collecting and returning the waveform was missing from the source,
        # which left the function returning None.
        raw_sounds.append(X)
    return raw_sounds
# Project: TensorFlow_AudioSet_Example | Author: DantesLegacy | project source | file source
def extract_feature(file_name):
    """Return time-averaged audio features of one file.

    Returns:
        ``(mfccs, chroma, mel, contrast, tonnetz)``; each is the mean over
        frames of the corresponding librosa feature. MFCCs use 40
        coefficients.
    """
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    # `y` is passed by keyword: recent librosa releases reject a positional
    # signal argument.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    return mfccs,chroma,mel,contrast,tonnetz
def norm_audio(self):
    '''Normalize the audio files
    used before training using a independent script

    Every file in self.audiofiles and then self.noisefiles is reloaded at
    16 kHz, scaled so that its peak amplitude becomes 1/3, and written back
    over the original file. Empty or completely silent files are left
    untouched.
    '''
    for file in list(self.audiofiles) + list(self.noisefiles):
        audio, sr = librosa.load(file, sr=16000)
        peak = np.max(np.abs(audio)) if audio.size else 0
        if peak == 0:
            # Nothing to scale; dividing by a zero peak would write inf/NaN.
            continue
        div_fac = 1 / peak / 3.0
        audio = audio * div_fac
        librosa.output.write_wav(file, audio, sr)
def callback(recognizer, audio):
    """Recognize one captured phrase and reload it as a decimated waveform.

    The phrase is sent to Google Speech Recognition, stored as ``train.wav``
    and loaded back with librosa at its native rate, keeping every third
    sample. Recognition errors are reported on stdout.

    NOTE(review): the source block is truncated; what was done with
    ``sentence`` and ``wave`` afterwards is not visible here.
    """
    try:
        sentence = recognizer.recognize_google(audio, language=language)
        wave_file_name = "train.wav"
        # NOTE(review): the write was missing from the source, which opened
        # the file and never closed it; get_wav_data() is assumed -- confirm.
        with open(wave_file_name, "wb") as wav_file:
            wav_file.write(audio.get_wav_data())
        wave, sample_rate = librosa.load(wave_file_name, mono=True, sr=None)
        wave = wave[::3]

    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))
def load_wav_files(files):
    """Load each file at ``SAMPLINGRATE`` and return the waveforms in order.

    The index and name of every file are printed as it is loaded.
    """
    wav_files = []
    for i, f in enumerate(files):
        # Formatted explicitly so the output is the same on Python 2 and 3.
        print("{} {}".format(i, f))
        wav_files.append(librosa.load(f, sr=SAMPLINGRATE)[0])
    return wav_files
def get_spectrograms(sound_file):
    '''Extracts melspectrogram and magnitude spectrogram from given `sound_file`.

    Args:
      sound_file: A string. Full path of a sound file.

    Returns:
      Transposed S: A 2d array. A transposed melspectrogram with shape of (T, n_mels)
      Transposed magnitude: A 2d array.Has shape of (T, 1+hp.n_fft//2)
    '''
    # Loading sound file at its native sampling rate
    y, sr = librosa.load(sound_file, sr=None)

    # stft. D: (1+n_fft//2, T)
    # NOTE(review): the arguments of this call were truncated in the source.
    # n_fft=hp.n_fft follows from the documented output shape; any hop or
    # window length arguments were lost and must be restored from upstream.
    D = librosa.stft(y=y, n_fft=hp.n_fft)

    # magnitude spectrogram
    magnitude = np.abs(D)  # (1+n_fft/2, T)

    # power spectrogram
    power = magnitude ** 2  # (1+n_fft/2, T)

    # mel spectrogram
    S = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels)  # (n_mels, T)

    return np.transpose(S.astype(np.float32)), np.transpose(magnitude.astype(np.float32))  # (T, n_mels), (T, 1+n_fft/2)
def test_load_sound(self):
    """A Sound read from file must hold the samples librosa loads directly.

    Also constructs a Sound from random samples at a random sampling rate.
    """
    from_disk = Sound.from_file(self.audio_file)

    samples, rate = librosa.load(self.audio_file)
    from_array = Sound(samples, rate)

    same_samples = numpy.all(from_disk.y == from_array.y)
    self.assertTrue(same_samples)

    random_samples = numpy.random.rand(random.randint(1, 100000))
    random_rate = random.choice((88200, 44100, 22050, 11025))
    random_sound = Sound(random_samples, random_rate)
# Project: aupyom | Author: pierre-rouanet | project source | file source
def from_file(cls, filename, sr=22050):
    """ Build an instance from an audio file, loaded at sr=22050 unless told otherwise. """
    samples, loaded_rate = librosa.load(filename, sr=sr)
    return cls(samples, loaded_rate)

    # Chunk iterator
def parse_wav(filename, n_mfcc=40):
    """
    Parses a single wav file into MFCC's and sample rate.

    Arguments:
        filename - Name of input wav file.
        n_mfcc   - Number of coefficients to use.

    Returns:
        A tuple with a numpy array with cepstrum coefficients, and sample rate.
        Files whose name does not end in '.wav' give an empty array and a
        sample rate of -1.
    """
    song_data = np.array([])
    sample_rate = -1
    if filename.endswith('.wav'):
        y_data, sample_rate = librosa.load(filename)
        #  will need to experiment with different values for n_mfcc
        # NOTE(review): the tail of this call was truncated in the source;
        # sr and n_mfcc are reconstructed from the names in scope -- confirm.
        song_data = librosa.feature.mfcc(y=y_data,
                                         sr=sample_rate,
                                         n_mfcc=n_mfcc)

    return (song_data, sample_rate)
def compute_spectrograms(filename):
    """Yield consecutive log-mel spectrogram segments of one audio file.

    The file is loaded as mono at 22050 Hz and passed through `_normalize`.
    Segments are SEGMENT_DUR columns wide and do not overlap; a trailing
    partial segment is dropped. Audio shorter than one second raises an
    Exception on first iteration.
    """
    out_rate = 22050

    frames, rate = librosa.load(filename, sr=out_rate, mono=True)
    too_short = len(frames) < out_rate
    if too_short:
        # under one second of samples cannot be processed
        raise Exception("Audio duration is too short")

    normalized_audio = _normalize(frames)
    melspectr = librosa.feature.melspectrogram(
        y=normalized_audio, sr=out_rate, n_mels=N_MEL_BANDS, fmax=out_rate/2)
    logmelspectr = librosa.logamplitude(melspectr**2, ref_power=1.0)

    # step through the spectrogram one whole segment at a time
    stop = logmelspectr.shape[1] - SEGMENT_DUR + 1
    for segment_start in range(0, stop, SEGMENT_DUR):
        segment_end = segment_start + SEGMENT_DUR
        yield logmelspectr[:, segment_start:segment_end]
def get_feature_aqibsaeed_1(X, sr, au_path=None):
    """Return one flat feature vector for an audio signal.

    Args:
        X: Audio samples; ignored when ``au_path`` is given.
        sr: Sampling rate of ``X``; ignored when ``au_path`` is given.
        au_path: Optional file to load the signal from instead.

    Returns:
        The time-averaged MFCC (40 coefficients), chroma, mel spectrogram,
        spectral contrast and tonnetz features, concatenated in that order.
    """
    import librosa
    if au_path is not None:
        X, sr = librosa.load(au_path)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sr, n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T,axis=0)
    # `y` is passed by keyword: recent librosa releases reject a positional
    # signal argument.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sr).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sr).T,axis=0)
    feature = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
    return feature
def __init__(self, cache=None, **kwargs):
    """Load the GTZAN annotations and prepare ``self.X`` / ``self.y``.

    Args:
        cache: ``None``, ``'raw'`` or the name of a feature cache such as
            ``'mfcc'``. A ``'cache'`` entry in ``kwargs['conf']`` overrides
            this argument.
        **kwargs: Forwarded to the base-class initializer.
    """
    super(GTZAN, self).__init__(**kwargs)
    if kwargs.get('conf') is not None:
        conf = kwargs['conf']
        cache = conf.get('cache', None)
    data_set_path = osp.join(DEFAULT_IMAGEST_BASE, self.data_set)
    self.data_set_path = data_set_path
    self.cache = cache
    X, y = parse_anno_file(data_set_path)
    if cache == 'raw':
        import librosa
        from tqdm import trange
        # Each waveform is cut or zero-padded to 661500 samples.
        X_new = np.zeros((len(X), 1, 661500, 1))
        for i in trange(len(X)):
            x,_ = librosa.load(osp.join(DEFAULT_DATA_BASE, X[i]))
            x_len = min(661500, len(x))
            X_new[i,:,:x_len,0] = x[:x_len]
        # NOTE(review): unlike the 'mfcc' branch below, X_new is never
        # assigned to X here, so the loaded waveforms are discarded --
        # confirm whether `X = X_new` was lost from the source.
    if cache is not None and cache != 'raw':
        X = self.load_cache_X(X, cache)
        if cache == 'mfcc':
            # Each feature matrix is cut or zero-padded to 1280 frames.
            X_new = np.zeros((len(X), X[0].shape[0], 1280, 1))
            for i, x in enumerate(X):
                x_len = min(x.shape[1], 1280)
                X_new[i,:,:x_len,0] = x[:,:x_len]
            X = X_new

    # layout_X
    if self.layout_x == 'rel_path':
        self.X = X
    else:
        self.X = self.init_layout_X(X)
    # layout_y
    self.y = self.init_layout_y(y)