Rewrite script in Python

2021-11-14 15:49:35 -03:00 · 2021-11-14 15:49:35 -03:00 · 4b94f37e23
parent 29c7b3e72f
commit 4b94f37e23
3 changed files with 232 additions and 53 deletions
--- a/README.md
+++ b/README.md
@ -1,9 +1,28 @@
 # fsub
-`fsub` is a very simple script (less than 60 lines of code) for cleaning a .srt file
+`fsub` is a Python script for cleaning, editing and fixing a SubRip (.srt) file

 # Usage
-`fsub <file>`
+```
+usage: fsub [-h] [-c] [-s MS] [-n] file [file ...]
+
+Fix, edit and clean SubRip (.srt) files.
+
+positional arguments:
+  file               list of input files (they all must be SubRip files)
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -c, --clean        removes subtitles matching regular expressions listed in ~/.config/fsubrc
+                     (this is the default behavior if no other flag is passed)
+  -s MS, --shift MS  shifts all subtitles by MS milliseconds, which may be positive or
+                     negative
+  -n, --no-html      strips HTML tags from subtitles content
+```

 # Features
 - Fixes subtitle numbering
- Removes lines which have words listed in `~/.config/fsubrc`
+- Converts files to UTF-8 encoding
+- Validates file structure
+- May remove subtitles containing lines that match any regular expression listed in `~/.config/fsubrc`
+- May shift the time of all subtitles
+- May strip HTML
--- a/259
+++ b/259
@ -1,59 +1,218 @@
-#!/bin/sh
-set -e
+#!/bin/python
+import sys
+import argparse
+import re
+import chardet
+import os

-FSUBRC=~/.config/fsubrc

-usage() {
-	echo "usage: fsub <files>"
-	echo "fsub expects $FSUBRC to have a blacklist of words"
-	exit 1
-}
+class Time:
+    def __init__(self, time_str, file_name, line_number):
+        """Parse a SubRip timestamp such as ``01:02:03,004``.
+
+        The result is stored in ``self.time`` as a total number of
+        milliseconds.
+
+        Args:
+            time_str: timestamp text in ``HH:MM:SS,mmm`` form.
+            file_name: name of the file being parsed (used in the error
+                message only).
+            line_number: line the timestamp was read from (used in the
+                error message only).
+
+        Prints a message to stderr and exits with status 1 when the
+        timestamp cannot be parsed.
+        """
+        try:
+            # Unpacking enforces exactly three ':'-separated fields, so a
+            # truncated or over-long timestamp is rejected instead of being
+            # partially read.
+            hours, minutes, seconds = time_str.split(':')
+            h = int(hours)
+            m = int(minutes)
+            # With a three-digit fraction, dropping the ',' turns 'SS,mmm'
+            # into a plain millisecond count.
+            ms = int(seconds.replace(',', ''))
+        except ValueError:
+            print('Invalid time format detected ({}:{})'
+                  .format(file_name, line_number),
+                  file=sys.stderr)
+            sys.exit(1)
+        # self.time: time in milliseconds
+        self.time = h * 3600000 + m * 60000 + ms

-[ -z "$1" ] && usage
+    def add(self, ms):
+        """Move this timestamp by ``ms`` milliseconds."""
+        self.time = self.time + ms

-for arg in "$@"; do
-	case "$arg" in
-		*.srt ) 
-			if [ ! -f "$arg" ]; then
-				echo "$arg is not a file"
-				usage
-			fi ;;
-		*) echo "$arg is not a .srt file"; usage ;;
-	esac
-done
+    def __repr__(self):
+        """Return the timestamp in SubRip ``HH:MM:SS,mmm`` form."""
+        # divmod keeps every field an exact integer; true division would
+        # yield floats and depend on '%d' truncating them.
+        total_seconds, ms = divmod(self.time, 1000)
+        total_minutes, s = divmod(total_seconds, 60)
+        h, m = divmod(total_minutes, 60)
+        return '%02d:%02d:%02d,%03d' % (h, m, s, ms)

-[ -f "$FSUBRC" ] || touch $FSUBRC

-for arg in "$@"; do
-	awk '
-	BEGIN {
-		n = 1
-		i = 0
-		while(getline < "'$FSUBRC'") {
-			blacklist[i] = $0
-			i++
-		}
-	}
-	/^[[:digit:]]+[[:space:]]*$/ { 
-		getline
-		time = $0
+class Subtitle:
+    def __init__(self, lines, file_name, line_number):
+        """Build a subtitle from the raw lines of one SubRip entry.
+
+        Args:
+            lines: the entry's lines: subtitle number, time span, then
+                content. The first two are popped from this list, so the
+                caller's list is modified and ends up holding only the
+                content lines, which are kept as ``self.content``.
+            file_name: name of the file being parsed (used in error
+                messages only).
+            line_number: line number reported for the subtitle number;
+                the time span is reported on the following line.
+
+        Prints a message to stderr and exits with status 1 when the
+        number or the time span cannot be parsed.
+        """
+        try:
+            # This is mostly ignored, as the subtitles are renumbered later
+            self.number = int(lines.pop(0))
+        except Exception:
+            print('Invalid line number detected ({}:{})'
+                  .format(file_name, line_number),
+                  file=sys.stderr)
+            sys.exit(1)

-		linen = 0
-		while(getline) {
-			lines[linen] = $0
-			linen++
-			if($0 ~ /^[[:space:]]*$/) break
-		}
+        # The time span is on the line right after the subtitle number
+        line_number += 1

-		for(j = 0; j < i; j++)
-			for(k = 0; k < linen; k++)
-				if(lines[k] ~ blacklist[j]) next
+        try:
+            time_span = lines.pop(0).split(' --> ')

-		print n
-		n++
-		print time
-		for(j = 0; j < linen; j++)
-			print lines[j]
-	}' "$arg" | sed 's/\r//' > /tmp/fsub
-	mv /tmp/fsub "$arg"
-done
+            self.time_start = Time(time_span[0], file_name, line_number)
+            self.time_end = Time(time_span[1], file_name, line_number)
+        except Exception:
+            # Time reports malformed timestamps itself (it exits); this
+            # handler covers a missing line or a missing ' --> ' separator.
+            print('Invalid time span format detected ({}:{})'
+                  .format(file_name, line_number),
+                  file=sys.stderr)
+            sys.exit(1)
+
+        # Whatever is left after the two pops is the subtitle text
+        self.content = lines
+
+    def shift(self, ms):
+        """Move both ends of the time span by ``ms`` milliseconds."""
+        for endpoint in (self.time_start, self.time_end):
+            endpoint.add(ms)
+
+    def matches(self, regexp):
+        """Tell whether any content line contains a match for ``regexp``.
+
+        ``regexp`` is a compiled pattern; returns True or False.
+        """
+        return any(regexp.findall(text) for text in self.content)
+
+    def __repr__(self):
+        """Return the subtitle as SubRip text.
+
+        The result has three parts, one per line: the subtitle number, the
+        ``start --> end`` time span, and the content lines. There is no
+        trailing newline.
+        """
+        # Use '\n' throughout: the result is written through a text-mode
+        # file, which already translates '\n' to the platform separator.
+        # Joining with os.linesep would be translated a second time
+        # ('\r\r\n' on Windows) and disagree with the '\n' used above it.
+        return '{}\n{} --> {}\n{}'.format(
+                self.number,
+                self.time_start, self.time_end,
+                '\n'.join(self.content)
+        )
+
+
+def clean(subs):
+    """Drop every subtitle matching an expression from ~/.config/fsubrc.
+
+    The rc file holds one regular expression per line; blank lines are
+    ignored. A subtitle is dropped when any of its content lines matches
+    any of the expressions.
+
+    Args:
+        subs: iterable of objects exposing ``matches(compiled_regexp)``.
+
+    Returns:
+        A new list with the surviving subtitles, in their original order.
+        When the rc file is missing or lists no expression, every subtitle
+        is kept.
+    """
+    # Read expressions in ~/.config/fsubrc
+    rc_path = os.path.join(os.path.expanduser('~'), '.config', 'fsubrc')
+    try:
+        with open(rc_path, 'r') as fsubrc:
+            lines = re.split(r'\r?\n', fsubrc.read())
+    except FileNotFoundError:
+        # No rc file means there is nothing to filter out
+        return list(subs)
+
+    # A blank line would compile to a pattern that matches every subtitle,
+    # so blank lines are skipped rather than treated as expressions.
+    expressions = [re.compile(line) for line in lines if line.strip()]
+
+    # Cancel if no expression
+    if not expressions:
+        return list(subs)
+
+    # Test each subtitle against every expression right away; chaining lazy
+    # filters over a loop variable would only ever apply the last one.
+    return [
+        sub for sub in subs
+        if not any(sub.matches(regexp) for regexp in expressions)
+    ]
+
+
+def shift(subs, ms):
+    """Shift every subtitle by ``ms`` milliseconds.
+
+    Each subtitle is shifted in place. The returned list keeps only the
+    subtitles whose start time is still zero or later.
+    """
+    for entry in subs:
+        entry.shift(ms)
+    return [entry for entry in subs if entry.time_start.time >= 0]
+
+
+def strip_html(subs):
+    """Remove HTML tags from the content lines of every subtitle.
+
+    Each ``sub.content`` list is modified in place; nothing is returned.
+
+    Args:
+        subs: iterable of objects with a ``content`` list of strings.
+    """
+    # Match one tag at a time: '<', optional '/', a letter, then anything up
+    # to the next '>'. A greedy '<.+>' would also swallow the text between
+    # the first and the last tag of a line.
+    tag = re.compile(r'</?[A-Za-z][^<>]*>')
+    for sub in subs:
+        for i, line in enumerate(sub.content):
+            sub.content[i] = tag.sub('', line)
+
+
+def process_file(args, file):
+    """Parse, transform and rewrite one SubRip file in place.
+
+    Args:
+        args: parsed command-line options; ``clean``, ``shift`` and
+            ``no_html`` select the transformations to apply.
+        file: open binary file object for the input. It is read fully and
+            closed, then the path in ``file.name`` is rewritten as UTF-8.
+
+    Exits with status 1 when the encoding cannot be detected. Parsing
+    errors are reported by ``Subtitle``, which exits too.
+    """
+    # Read the input file
+    try:
+        contents = file.read()
+    finally:
+        file.close()
+
+    # Decode the file contents
+    encoding = chardet.detect(contents)['encoding']
+    if encoding is None:
+        print('Corrupt or empty file ({})'.format(file.name),
+              file=sys.stderr)
+        sys.exit(1)
+    contents = contents.decode(encoding)
+
+    # Count empty lines at the beginning
+    r = re.compile(r'\r?\n')
+    line_number = 1
+    for line in r.split(contents):
+        if line.strip():
+            break
+        line_number += 1
+
+    # Split subtitles on empty lines
+    subs = re.split(r'(?:\r?\n){2}', contents.strip())
+
+    # Create Subtitle objects
+    subs_objs = []
+    for sub in subs:
+        lines = list(r.split(sub))
+        subs_objs.append(Subtitle(lines, file.name, line_number))
+        # Subtitle popped the number and time-span lines, so `lines` now
+        # holds the content only: add those two lines plus the blank
+        # separator to reach the next entry.
+        line_number += len(lines) + 3
+
+    # Clean if --clean is passed
+    if args.clean:
+        subs_objs = clean(subs_objs)
+
+    # Shift if --shift is passed
+    if args.shift:
+        subs_objs = shift(subs_objs, args.shift)
+
+    # Strip HTML if --no-html is passed
+    if args.no_html:
+        strip_html(subs_objs)
+
+    # Fix numbering
+    for number, sub in enumerate(subs_objs, start=1):
+        sub.number = number
+
+    # Join Subtitle objects back to a string. Plain '\n' is used because
+    # the text-mode file below translates it to the platform separator;
+    # writing os.linesep through it would yield '\r\r\n' on Windows.
+    contents = '\n\n'.join(map(repr, subs_objs))
+
+    # Write output; the context manager flushes and closes the file even
+    # if a write fails
+    with open(file.name, 'w', encoding='utf-8') as output:
+        output.write(contents)
+        output.write('\n')
+
+
+# Command-line interface: option flags first, then the input files
+parser = argparse.ArgumentParser(
+    add_help=True,
+    description='Fix, edit and clean SubRip (.srt) files.',
+)
+
+parser.add_argument(
+    '-c', '--clean',
+    action='store_true',
+    help=('removes subtitles matching regular expressions '
+          'listed in ~/.config/fsubrc (this is the default '
+          'behavior if no other flag is passed)'),
+)
+
+parser.add_argument(
+    '-s', '--shift',
+    action='store',
+    type=int,
+    metavar='MS',
+    help=('shifts all subtitles by MS milliseconds, which '
+          'may be positive or negative'),
+)
+
+parser.add_argument(
+    '-n', '--no-html',
+    action='store_true',
+    help='strips HTML tags from subtitles content',
+)
+
+# argparse opens every input file itself, in binary read/update mode
+parser.add_argument(
+    'files',
+    nargs='+',
+    type=argparse.FileType('rb+'),
+    metavar='file',
+    help='list of input files (they all must be SubRip files)',
+)
+
+args = parser.parse_args()
+
+# Cleaning is the fallback action when neither --shift nor --no-html
+# was requested
+if not (args.shift or args.no_html):
+    args.clean = True
+
+# Refuse to modify anything unless every input is named *.srt
+for file in args.files:
+    if not file.name.endswith('.srt'):
+        print('File {} is not a SubRip file'.format(file.name),
+              file=sys.stderr)
+        sys.exit(1)
+
+# All names validated: process the files in the order given
+for file in args.files:
+    process_file(args, file)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1 @@
+chardet