$ alias subdl='subdl --utf8 --download=all --existing=bypass --force-filename --username=xx --password=xx'
$ subdl Irreversible.2002.mp4
Searching for subtitles for query=Irreversible.2002...
Found 24 results for 'Irreversible.2002.mp4':
#1952041941 [en] [Rat: 0.0 DL:70016] "Irreversible" Irreversible.2002.DVDRip.XviD.AC3-DK.EN.srt
[...]
Found encoding Johab with a confidence of 99.00%. Converting to utf8.
Downloading #1952041941 to Irreversible.2002.en.1952041941.srt... Traceback (most recent call last):
File ".subdl-wrapped", line 9, in <module>
sys.exit(cli())
File "subdl.py", line 534, in cli
main(sys.argv[1:])
File "subdl.py", line 504, in main
AutoDownloadAndSave(file, search_result, downloaded)
File "subdl.py", line 339, in AutoDownloadAndSave
DownloadAndSaveSubtitle(search_result.IDSubtitleFile, output_filename)
File "subdl.py", line 315, in DownloadAndSaveSubtitle
s = s.decode(result["encoding"]) # bytes -> str
UnicodeDecodeError: 'johab' codec can't decode byte 0xb4 in position 13235: illegal multibyte sequence
cd $(mktemp -d)
sub_id=3431287
wget https://dl.opensubtitles.org/en/download/sub/$sub_id
unzip -B $sub_id
chardetect *.srt
# Irreversible.2002.DVDRip.XviD.AC3-DK.EN.srt: Johab with confidence 0.99
iconv -f johab -t utf8 *.srt >/dev/null
# iconv: illegal input sequence at position 13187
dd if=$(ls *.srt) bs=1 skip=$((13187 - 8)) count=16 status=none | hexdump -C
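# 00000000 79 21 0d 0a 2d 20 57 65 b4 72 65 20 67 6f 69 6e |y!..- We.re goin|
"Johab" (a Korean encoding) is an implausible guess for an English subtitle file. The offending byte 0xb4 sits where the apostrophe of "We're" belongs, and in latin1 0xb4 is the acute accent (´), so let's try latin1 instead: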
iconv -f latin1 -t utf8 *.srt >/dev/null && echo ok
# ok
Success! So the encoding was misdetected, which makes this a bug in chardet.
TODO: workaround: when the conversion to UTF-8 fails, keep the original file, show a warning, and rename the result file to $basename.noutf8.$extension,
for example Irreversible.2002.en.1952041941.noutf8.srt