"""Transcribe a WAV file to text with a pretrained wav2vec 2.0 CTC model.

Running this script loads the tokenizer and model weights for
``MODEL_NAME``, reads ``file_name`` with librosa at ``SAMPLE_RATE``,
performs greedy (argmax) CTC decoding and prints the transcription.
"""

import torch
import librosa
import numpy as np
import soundfile as sf
from scipy.io import wavfile
from IPython.display import Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer


# Checkpoint used for both the tokenizer and the acoustic model.
MODEL_NAME = "facebook/wav2vec2-base-960h"

# Rate passed to librosa.load; the audio is resampled to this rate if the
# file was recorded at a different one.
# NOTE(review): 16 kHz is presumably what this checkpoint expects -- confirm
# against the model card before changing it.
SAMPLE_RATE = 16000

# NOTE(review): recent transformers releases appear to deprecate
# Wav2Vec2Tokenizer for raw-audio input in favour of Wav2Vec2Processor --
# confirm against the installed version before upgrading.
tokenizer = Wav2Vec2Tokenizer.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

file_name = 'my-audio.wav'

# The waveform is decoded once, by librosa. The earlier additional
# scipy.io.wavfile read only fed values that were never used afterwards.
input_audio, _ = librosa.load(file_name, sr=SAMPLE_RATE)
input_values = tokenizer(input_audio, return_tensors="pt").input_values

# Inference only: disable autograd so no graph is built and intermediate
# activations are not retained.
with torch.no_grad():
    logits = model(input_values).logits

# Greedy decoding: take the highest-scoring token id at every frame and let
# the tokenizer turn the id sequence into text.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(predicted_ids)[0]
print(transcription)