import torch
import librosa
import numpy as np
from scipy.io import wavfile
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# Load the tokenizer and model pretrained on 100 hours of LibriSpeech
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-100h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-100h")

file_name = 'my-audio.wav'

# Read the raw waveform and build a time axis (useful for plotting the signal)
framerate, sounddata = wavfile.read(file_name)
time = np.arange(0, len(sounddata)) / framerate

# Reload the file with librosa, resampling to the 16 kHz rate wav2vec2 expects
input_audio, _ = librosa.load(file_name, sr=16000)

# Tokenize the waveform into model inputs and run inference without gradients
input_values = tokenizer(input_audio, return_tensors="pt").input_values
with torch.no_grad():
    logits = model(input_values).logits

# Greedy CTC decoding: pick the most likely token per frame, then collapse to text
predicted_ids = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(predicted_ids)[0]
print(transcription)
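# Note: recent versions of transformers deprecate passing raw audio through
# Wav2Vec2Tokenizer in favor of Wav2Vec2Processor, which bundles the feature
# extractor and tokenizer. A minimal sketch of the same pipeline using the
# processor (an alternative to the code above, not a required change):
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-100h")
input_values = processor(input_audio, sampling_rate=16000, return_tensors="pt").input_values
with torch.no_grad():
    logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids)[0])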