#!/usr/bin/env python3
"""fsub: fix, edit and clean SubRip (.srt) files.

Usage::

    fsub [-h] [-c] [-s MS] [-n] file [file ...]

Features:

- Fixes subtitle numbering.
- Converts files to UTF-8 encoding.
- Validates file structure.
- With ``-c``/``--clean``, removes subtitles containing a line that matches
  any regular expression listed in ``~/.config/fsubrc`` (this is the default
  behavior if no other flag is passed).
- With ``-s``/``--shift MS``, shifts all subtitles by MS milliseconds, which
  may be positive or negative.
- With ``-n``/``--no-html``, strips HTML tags from subtitle content.

Encoding detection uses the third-party ``chardet`` package (listed in
requirements.txt). When it is not installed, input is assumed to be UTF-8.
"""
import argparse
import os
import re
import sys

try:
    import chardet
except ImportError:
    # Keep the module importable without the optional dependency; _decode()
    # then falls back to strict UTF-8.
    chardet = None

# Blacklist of regular expressions, one per line.
_FSUBRC = os.path.join('~', '.config', 'fsubrc')

# Line terminator accepted in input files (LF or CRLF).
_LINE_BREAK = re.compile(r'\r?\n')

# A single HTML tag. Requiring a letter after '<' (or '</') and forbidding
# angle brackets inside keeps text such as 'a < b' or '<i>x</i>' content
# intact; a greedy '<.+>' would delete everything between the first '<' and
# the last '>' on the line.
_HTML_TAG = re.compile(r'</?[A-Za-z][^<>]*>')


def _fail(message):
    """Print *message* to stderr and terminate with exit status 1."""
    print(message, file=sys.stderr)
    sys.exit(1)


class Time:
    """A SubRip timestamp (``HH:MM:SS,mmm``) stored as milliseconds.

    ``file_name`` and ``line_number`` are only used to build the error
    message; a malformed timestamp terminates the program (exit status 1).
    """

    def __init__(self, time_str, file_name, line_number):
        fields = time_str.split(':')
        try:
            hours = int(fields[0])
            minutes = int(fields[1])
            # 'SS,mmm' with the comma removed reads directly as a number of
            # milliseconds (relies on the 3-digit millisecond field).
            millis = int(fields[2].replace(',', ''))
        except (IndexError, ValueError):
            _fail('Invalid time format detected ({}:{})'
                  .format(file_name, line_number))
        # self.time: time in milliseconds
        self.time = hours * 3600000 + minutes * 60000 + millis

    def add(self, ms):
        """Move the timestamp by *ms* milliseconds (may be negative)."""
        self.time += ms

    def __repr__(self):
        # Integer arithmetic throughout: no float rounding on long files.
        total_seconds, ms = divmod(self.time, 1000)
        total_minutes, s = divmod(total_seconds, 60)
        h, m = divmod(total_minutes, 60)
        return '%02d:%02d:%02d,%03d' % (h, m, s, ms)


class Subtitle:
    """One cue of a SubRip file: number, time span and text lines.

    *lines* is the list of raw lines of the cue (number line first). The
    list is consumed in place: after construction it holds only the text
    lines and is shared with ``self.content``. *line_number* is the line of
    the cue's number inside *file_name*, used for error messages. Malformed
    input terminates the program (exit status 1).
    """

    def __init__(self, lines, file_name, line_number):
        try:
            # This is mostly ignored, as the subtitles are renumbered later
            self.number = int(lines.pop(0))
        except (IndexError, ValueError):
            _fail('Invalid line number detected ({}:{})'
                  .format(file_name, line_number))

        line_number += 1

        try:
            time_span = lines.pop(0).split(' --> ')
            start, end = time_span[0], time_span[1]
        except IndexError:
            # Either the time line is missing or it has no ' --> ' separator
            _fail('Invalid time span format detected ({}:{})'
                  .format(file_name, line_number))

        self.time_start = Time(start, file_name, line_number)
        self.time_end = Time(end, file_name, line_number)
        self.content = lines

    def shift(self, ms):
        """Move both ends of the time span by *ms* milliseconds."""
        self.time_start.add(ms)
        self.time_end.add(ms)

    def matches(self, regexp):
        """Return True if any text line contains a match of *regexp*."""
        return any(regexp.search(line) for line in self.content)

    def __repr__(self):
        # Plain '\n' only: the output file is opened in text mode, which
        # already translates '\n' to the platform line separator.
        return '{}\n{} --> {}\n{}'.format(
            self.number,
            self.time_start, self.time_end,
            '\n'.join(self.content)
        )


def clean(subs, rc_path=None):
    """Return the subtitles of *subs* that match no blacklisted expression.

    Expressions are read one per line from *rc_path* (default:
    ``~/.config/fsubrc``). Blank lines are ignored, since an empty pattern
    would match every subtitle. A missing file means "nothing to remove".
    An invalid expression terminates the program (exit status 1).
    """
    subs = list(subs)
    if rc_path is None:
        rc_path = os.path.expanduser(_FSUBRC)

    try:
        with open(rc_path, 'r', encoding='utf-8') as fsubrc:
            text = fsubrc.read()
    except FileNotFoundError:
        return subs

    patterns = [line for line in _LINE_BREAK.split(text.strip())
                if line.strip()]
    try:
        expressions = [re.compile(pattern) for pattern in patterns]
    except re.error as err:
        _fail('Invalid regular expression in {} ({})'.format(rc_path, err))

    return [sub for sub in subs
            if not any(sub.matches(regexp) for regexp in expressions)]


def shift(subs, ms):
    """Shift every subtitle by *ms* milliseconds, in place.

    Returns the list of subtitles that still start at a non-negative time;
    the others are dropped.
    """
    subs = list(subs)
    for sub in subs:
        sub.shift(ms)
    return [sub for sub in subs if sub.time_start.time >= 0]


def strip_html(subs):
    """Remove HTML tags from the text lines of every subtitle, in place."""
    for sub in subs:
        sub.content[:] = [_HTML_TAG.sub('', line) for line in sub.content]


def _decode(raw, file_name):
    """Decode the bytes of an input file, exiting on undecodable input."""
    if chardet is not None:
        encoding = chardet.detect(raw)['encoding']
    else:
        # 'utf-8-sig' also swallows a leading byte order mark
        encoding = 'utf-8-sig' if raw else None

    if encoding is None:
        _fail('Corrupt or empty file ({})'.format(file_name))
    try:
        return raw.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        _fail('Could not decode file as {} ({})'.format(encoding, file_name))


def _split_blocks(text):
    """Split *text* into cues separated by one or more empty lines.

    Returns a list of ``(line_number, lines)`` pairs where *line_number* is
    the 1-based line on which the cue starts. Whitespace-only lines are
    skipped between cues but kept when they appear inside a cue.
    """
    blocks = []
    current = None
    for number, line in enumerate(_LINE_BREAK.split(text.rstrip()), start=1):
        if current is None:
            if line.strip():
                current = [line]
                blocks.append((number, current))
        elif line:
            current.append(line)
        else:
            current = None
    return blocks


def process_file(args, file):
    """Apply the operations selected in *args* to *file*, in place.

    *file* is an open binary file object; it is read, closed, and then
    rewritten by name as UTF-8 with fresh subtitle numbering. *args* must
    provide the ``clean``, ``shift`` and ``no_html`` attributes.
    """
    # Read the input file
    with file:
        raw = file.read()

    # Parse it into Subtitle objects
    text = _decode(raw, file.name)
    subs = [Subtitle(lines, file.name, line_number)
            for line_number, lines in _split_blocks(text)]
    if not subs:
        _fail('Corrupt or empty file ({})'.format(file.name))

    # Clean if --clean is passed
    if args.clean:
        subs = clean(subs)

    # Shift if --shift is passed
    if args.shift:
        subs = shift(subs, args.shift)

    # Strip HTML if --no-html is passed
    if args.no_html:
        strip_html(subs)

    # Fix numbering
    for number, sub in enumerate(subs, start=1):
        sub.number = number

    # Write output; the context manager guarantees the data is flushed
    with open(file.name, 'w', encoding='utf-8') as output:
        output.write('\n\n'.join(map(repr, subs)))
        output.write('\n')


def _build_parser():
    """Create the command-line parser."""
    parser = argparse.ArgumentParser(
        description='Fix, edit and clean SubRip (.srt) files.',
        add_help=True
    )
    parser.add_argument(
        '-c', '--clean',
        help='removes subtitles matching regular expressions '
             'listed in ~/.config/fsubrc (this is the default '
             'behavior if no other flag is passed)',
        action='store_true'
    )
    parser.add_argument(
        '-s', '--shift',
        help='shifts all subtitles by MS milliseconds, which '
             'may be positive or negative',
        metavar='MS',
        action='store',
        type=int
    )
    parser.add_argument(
        '-n', '--no-html',
        help='strips HTML tags from subtitles content',
        action='store_true'
    )
    parser.add_argument(
        'files',
        help='list of input files (they all must be SubRip files)',
        metavar='file',
        type=argparse.FileType('rb+'),
        nargs='+'
    )
    return parser


def main(argv=None):
    """Command-line entry point; *argv* defaults to ``sys.argv[1:]``."""
    args = _build_parser().parse_args(argv)

    # Make sure --clean is the default
    if not args.shift and not args.no_html:
        args.clean = True

    # Check if all files are .srt before touching any of them
    for file in args.files:
        if not file.name.endswith('.srt'):
            _fail('File {} is not a SubRip file'.format(file.name))

    for file in args.files:
        process_file(args, file)


if __name__ == '__main__':
    main()