from subtoaudio import SubToAudio

# Convert an SRT subtitle file to speech with Coqui XTTS v2.
sub = SubToAudio(model_name="tts_models/multilingual/multi-dataset/xtts_v2")

# Parse the subtitle file into timed text entries.
subtitle = sub.subtitle("texts/1-1.srt")

# BUG FIX: XTTS v2 is a multi-speaker voice-cloning model — it derives its
# conditioning latents from a reference audio clip, so a speaker reference is
# REQUIRED. Without it, speaker_wav=None propagates through
# tts_to_file -> xtts.get_conditioning_latents -> torchaudio.load(None),
# which raises "TypeError: Invalid file: None" (the traceback below).
# `speaker_wav` is forwarded via **kwargs to TTS's tts_to_file; point it at a
# short, clean WAV recording (a few seconds) of the voice to clone.
sub.convert_to_audio(
    sub_data=subtitle,
    output_path="subtitle3.wav",
    language="ru",
    speaker_wav="reference_voice.wav",  # TODO: set to your actual reference clip
)
tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
Using model: xtts
ffmpeg version 2023-11-28-git-47e214245b-full_build-www.gyan.dev Copyright (c) 2000-2023 the FFmpeg developers
built with gcc 12.2.0 (Rev10, Built by MSYS2 project)
configuration: --enable-gpl --enable-version3 --enable-static --pkg-config=pkgconf --disable-w32threads --disable-autodetect --enable-fontconfig --enable-iconv --enable-gnutls --enable-libxml2 --enable-gmp --enable-bzlib --enable-lzma --enable-libsnappy --enable-zlib --enable-librist --enable-libsrt --enable-libssh --enable-libzmq --enable-avisynth --enable-libbluray --enable-libcaca --enable-sdl2 --enable-libaribb24 --enable-libaribcaption --enable-libdav1d --enable-libdavs2 --enable-libuavs3d --enable-libzvbi --enable-librav1e --enable-libsvtav1 --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxavs2 --enable-libxvid --enable-libaom --enable-libjxl --enable-libopenjpeg --enable-libvpx --enable-mediafoundation --enable-libass --enable-frei0r --enable-libfreetype --enable-libfribidi --enable-libharfbuzz --enable-liblensfun --enable-libvidstab --enable-libvmaf --enable-libzimg --enable-amf --enable-cuda-llvm --enable-cuvid --enable-ffnvcodec --enable-nvdec --enable-nvenc --enable-dxva2 --enable-d3d11va --enable-libvpl --enable-libshaderc --enable-vulkan --enable-libplacebo --enable-opencl --enable-libcdio --enable-libgme --enable-libmodplug --enable-libopenmpt --enable-libopencore-amrwb --enable-libmp3lame --enable-libshine --enable-libtheora --enable-libtwolame --enable-libvo-amrwbenc --enable-libcodec2 --enable-libilbc --enable-libgsm --enable-libopencore-amrnb --enable-libopus --enable-libspeex --enable-libvorbis --enable-ladspa --enable-libbs2b --enable-libflite --enable-libmysofa --enable-librubberband --enable-libsoxr --enable-chromaprint
libavutil 58. 32.100 / 58. 32.100
libavcodec 60. 35.100 / 60. 35.100
libavformat 60. 18.100 / 60. 18.100
libavdevice 60. 4.100 / 60. 4.100
libavfilter 9. 14.100 / 9. 14.100
libswscale 7. 6.100 / 7. 6.100
libswresample 4. 13.100 / 4. 13.100
libpostproc 57. 4.100 / 57. 4.100
Input #0, srt, from 'texts/1-1.srt':
Duration: N/A, bitrate: N/A
Stream #0:0: Subtitle: subrip
Output #0, srt, to 'C:\Users\idres\AppData\Local\Temp\tmpnyfz59xi.srt':
Metadata:
encoder : Lavf60.18.100
Stream #0:0: Subtitle: subrip
Metadata:
encoder : Lavc60.35.100 srt
Stream mapping:
Stream #0:0 -> #0:0 (subrip (srt) -> subrip (srt))
Press [q] to stop, [?] for help
[out#0/srt @ 000001c14f583480] video:0kB audio:0kB subtitle:1kB other streams:0kB global headers:0kB muxing overhead: 28.727885%
size= 1kB time=00:00:36.96 bitrate= 0.3kbits/s speed=5.14e+04x
Temporary folder: C:\Users\idres\AppData\Local\Temp\tmppqcheu9m
Text splitted to sentences.
['Привет всем, сегодня мы рассмотрим Warhammer 40000 Rogue Traider']
Traceback (most recent call last):
File "F:\whisper\tts.py", line 59, in
sub.convert_to_audio(sub_data=subtitle, output_path="subtitle3.wav", language="ru")
File "F:\whisper\virtual\lib\site-packages\subtoaudio\subtoaudio.py", line 120, in convert_to_audio
tts_method(f"{entry_data['text']}",file_path=audio_path,**convert_param,**kwargs)
File "F:\whisper\virtual\lib\site-packages\TTS\api.py", line 432, in tts_to_file
wav = self.tts(
File "F:\whisper\virtual\lib\site-packages\TTS\api.py", line 364, in tts
wav = self.synthesizer.tts(
File "F:\whisper\virtual\lib\site-packages\TTS\utils\synthesizer.py", line 383, in tts
outputs = self.tts_model.synthesize(
File "F:\whisper\virtual\lib\site-packages\TTS\tts\models\xtts.py", line 397, in synthesize
return self.inference_with_config(text, config, ref_audio_path=speaker_wav, language=language, **kwargs)
File "F:\whisper\virtual\lib\site-packages\TTS\tts\models\xtts.py", line 419, in inference_with_config
return self.full_inference(text, ref_audio_path, language, **settings)
File "F:\whisper\virtual\lib\site-packages\torch\utils_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "F:\whisper\virtual\lib\site-packages\TTS\tts\models\xtts.py", line 480, in full_inference
(gpt_cond_latent, speaker_embedding) = self.get_conditioning_latents(
File "F:\whisper\virtual\lib\site-packages\torch\utils_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "F:\whisper\virtual\lib\site-packages\TTS\tts\models\xtts.py", line 356, in get_conditioning_latents
audio = load_audio(file_path, load_sr)
File "F:\whisper\virtual\lib\site-packages\TTS\tts\models\xtts.py", line 72, in load_audio
audio, lsr = torchaudio.load(audiopath)
File "F:\whisper\virtual\lib\site-packages\torchaudio_backend\utils.py", line 204, in load
return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size)
File "F:\whisper\virtual\lib\site-packages\torchaudio_backend\soundfile.py", line 27, in load
return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format)
File "F:\whisper\virtual\lib\site-packages\torchaudio_backend\soundfile_backend.py", line 221, in load
with soundfile.SoundFile(filepath, "r") as file_:
File "F:\whisper\virtual\lib\site-packages\soundfile.py", line 658, in init
self._file = self._open(file, mode_int, closefd)
File "F:\whisper\virtual\lib\site-packages\soundfile.py", line 1212, in _open
raise TypeError("Invalid file: {0!r}".format(self.name))
TypeError: Invalid file: None