Rewrite script in Python

2021-11-14 15:49:35 -03:00
parent 29c7b3e72f
commit 4b94f37e23
3 changed files with 232 additions and 53 deletions
--- a/README.md
+++ b/README.md
@@ -1,9 +1,28 @@
 # fsub
-`fsub` is a very simple script (less than 60 lines of code) for cleaning a .srt file
+`fsub` is a Python script for cleaning, editing and fixing SubRip (.srt) files
 # Usage
-`fsub <file>`
+```
 usage: fsub [-h] [-c] [-s MS] [-n] file [file ...]
 Fix, edit and clean SubRip (.srt) files.
 positional arguments:
  file               list of input files (they all must be SubRip files)
 optional arguments:
  -h, --help         show this help message and exit
  -c, --clean        removes subtitles matching regular expressions listed in ~/.config/fsubrc
                     (this is the default behavior if no other flag is passed)
  -s MS, --shift MS  shifts all subtitles by MS milliseconds, which may be positive or
                     negative
  -n, --no-html      strips HTML tags from subtitles content
 ```
 # Features
 - Fixes subtitle numbering
- Removes lines which have words listed in `~/.config/fsubrc`
+- Converts files to UTF-8 encoding
 - Validates file structure
 - May remove subtitles containing lines that match any regular expression listed in `~/.config/fsubrc`
 - May shift the time of all subtitles
 - May strip HTML
--- a/259
+++ b/259
@@ -1,59 +1,218 @@
-#!/bin/sh
+#!/bin/python
-set -e
+import sys
 import argparse
 import re
 import chardet
 import os
 FSUBRC=~/.config/fsubrc
-usage() {
class Time:
    """A SubRip timestamp held as a whole number of milliseconds.

    The value is parsed from the ``HH:MM:SS,mmm`` notation and rendered
    back to that notation by ``repr()`` (and therefore by ``format()``).
    """

    def __init__(self, time_str, file_name, line_number):
        """Parse ``time_str`` and store the result in ``self.time``.

        ``file_name`` and ``line_number`` are only used to build the
        error message. On a malformed timestamp the message is printed
        to stderr and the process exits with status 1.
        """
        parsed_time = time_str.split(':')
        try:
            h = int(parsed_time[0])
            m = int(parsed_time[1])
            # Seconds and milliseconds are parsed as separate fields so a
            # missing or short millisecond part cannot be mistaken for
            # (part of) the seconds.
            seconds, _, millis = parsed_time[2].partition(',')
            s = int(seconds)
            ms = int(millis) if millis.strip() else 0
        except (IndexError, ValueError):
            print('Invalid time format detected ({}:{})'
                  .format(file_name, line_number),
                  file=sys.stderr)
            sys.exit(1)

        # self.time: time in milliseconds
        self.time = h * 3600000 + m * 60000 + s * 1000 + ms

    def add(self, ms):
        """Move the timestamp by ``ms`` milliseconds (may be negative)."""
        self.time += ms

    def __repr__(self):
        """Return the timestamp formatted as ``HH:MM:SS,mmm``."""
        # Integer arithmetic only: no float rounding on the way out.
        total_seconds, ms = divmod(self.time, 1000)
        total_minutes, s = divmod(total_seconds, 60)
        h, m = divmod(total_minutes, 60)
        return '%02d:%02d:%02d,%03d' % (h, m, s, ms)
 			fi ;;
 		*) echo "$arg is not a .srt file"; usage ;;
 	esac
 done
 [ -f "$FSUBRC" ] || touch $FSUBRC
-for arg in "$@"; do
class Subtitle:
    """One SubRip entry: sequence number, time span and text lines."""

    def __init__(self, lines, file_name, line_number):
        """Build a subtitle from the raw ``lines`` of one entry.

        ``lines`` is consumed in place: the number line and the time-span
        line are popped off the front, and the remaining list object
        becomes ``self.content``. ``file_name`` and ``line_number`` (the
        line the entry starts on) are used for error messages. On
        malformed input a message is printed to stderr and the process
        exits with status 1.
        """
        try:
            # This is mostly ignored, as the subtitles are renumbered later
            self.number = int(lines.pop(0))
        except (IndexError, ValueError):
            print('Invalid line number detected ({}:{})'
                  .format(file_name, line_number),
                  file=sys.stderr)
            sys.exit(1)

        # The time span sits on the line right after the number.
        line_number += 1

        try:
            time_span = lines.pop(0).split(' --> ')
            self.time_start = Time(time_span[0], file_name, line_number)
            self.time_end = Time(time_span[1], file_name, line_number)
        except (IndexError, ValueError):
            print('Invalid time span format detected ({}:{})'
                  .format(file_name, line_number),
                  file=sys.stderr)
            sys.exit(1)

        self.content = lines

    def shift(self, ms):
        """Move both ends of the time span by ``ms`` milliseconds."""
        self.time_start.add(ms)
        self.time_end.add(ms)

    def matches(self, regexp):
        """Return True if the compiled ``regexp`` is found in any text line."""
        # search() stops at the first hit; no list of all matches is needed.
        return any(regexp.search(line) for line in self.content)

    def __repr__(self):
        """Return the entry in SubRip notation, without a trailing newline."""
        # A plain '\n' is used throughout: a file opened in text mode
        # translates it to the platform line ending on write, whereas
        # os.linesep would be translated a second time on Windows.
        return '{}\n{} --> {}\n{}'.format(
            self.number,
            self.time_start, self.time_end,
            '\n'.join(self.content)
        )
 def clean(subs):
     """Drop every subtitle that matches an expression from ~/.config/fsubrc.

     Each non-blank line of the file is compiled as a regular expression.
     A subtitle is removed when any of its text lines matches any of the
     expressions (as decided by ``sub.matches``). Blank lines are skipped:
     compiled as-is they would yield an empty pattern, which matches
     everything and would wipe out the whole file. A missing config file
     is treated as an empty one.

     Returns a new list; ``subs`` itself is not modified.
     Raises ``re.error`` if a line is not a valid regular expression.
     """
     path = os.path.join(os.path.expanduser('~'), '.config', 'fsubrc')
     try:
         with open(path, 'r') as fsubrc:
             lines = re.split(r'\r?\n', fsubrc.read())
     except FileNotFoundError:
         lines = []

     expressions = [re.compile(line) for line in lines if line.strip()]

     # Nothing to filter on: hand every subtitle back untouched.
     if not expressions:
         return list(subs)

     return [
         sub for sub in subs
         if not any(sub.matches(regexp) for regexp in expressions)
     ]
 def shift(subs, ms):
     """Shift every subtitle by ``ms`` milliseconds.

     All entries are shifted in place first; the returned list then keeps
     only those whose start time is still at or after zero.
     """
     for entry in subs:
         entry.shift(ms)
     return [entry for entry in subs if entry.time_start.time >= 0]
 def strip_html(subs):
     """Remove HTML tags from the text lines of every subtitle, in place.

     Only tag-shaped spans are removed: ``<`` (optionally followed by
     ``/``), a letter, then everything up to the next ``>``. Each tag is
     matched on its own, so the text between an opening and a closing tag
     is kept, and stray comparison signs such as ``1 < 2 > 0`` are left
     alone. Returns None.
     """
     tag = re.compile(r'</?[A-Za-z][^<>]*>')
     for sub in subs:
         # Slice assignment keeps the same list object.
         sub.content[:] = [tag.sub('', line) for line in sub.content]
 def process_file(args, file):
     """Read, fix and rewrite one SubRip file in place.

     ``args`` is the parsed command line; ``args.clean``, ``args.shift``
     and ``args.no_html`` select the optional processing steps.
     ``file`` is an open binary file object; it is read completely and
     closed, and the file at ``file.name`` is then overwritten with the
     renumbered result encoded as UTF-8.

     Exits with status 1 (after printing to stderr) when the encoding
     cannot be detected or when a subtitle entry is malformed.
     """
     # Read the input file
     try:
         contents = file.read()
     finally:
         file.close()

     # Decode the file contents
     encoding = chardet.detect(contents)['encoding']
     if encoding is None:
         print('Corrupt or empty file ({})'.format(file.name),
               file=sys.stderr)
         sys.exit(1)
     contents = contents.decode(encoding)

     # Line number of the first non-blank line, for error messages
     leading = contents[:len(contents) - len(contents.lstrip())]
     line_number = 1 + leading.count('\n')

     # Split subtitles on runs of empty lines. The capturing group keeps
     # each separator in the result so the line counter stays exact even
     # when entries are separated by more than one empty line.
     newline = re.compile(r'\r?\n')
     parts = re.split(r'((?:\r?\n){2,})', contents.strip())
     chunks = parts[::2]
     gaps = parts[1::2] + ['']

     # Create Subtitle objects
     subs_objs = []
     for chunk, gap in zip(chunks, gaps):
         subs_objs.append(
             Subtitle(newline.split(chunk), file.name, line_number)
         )
         line_number += chunk.count('\n') + gap.count('\n')

     # Clean if --clean is passed
     if args.clean:
         subs_objs = clean(subs_objs)

     # Shift if --shift is passed
     if args.shift:
         subs_objs = shift(subs_objs, args.shift)

     # Strip HTML if --no-html is passed
     if args.no_html:
         strip_html(subs_objs)

     # Fix numbering
     for number, sub in enumerate(subs_objs, start=1):
         sub.number = number

     # Write output. The file is opened in text mode, which already
     # translates '\n' to the platform line ending, so '\n' is written
     # rather than os.linesep.
     with open(file.name, 'w', encoding='utf-8') as output:
         output.write('\n\n'.join(map(repr, subs_objs)))
         output.write('\n')
 # Command-line interface: optional processing flags plus one or more files.
 parser = argparse.ArgumentParser(
     add_help=True,
     description='Fix, edit and clean SubRip (.srt) files.',
 )

 parser.add_argument(
     '-c', '--clean',
     action='store_true',
     help='removes subtitles matching regular expressions '
          'listed in ~/.config/fsubrc (this is the default '
          'behavior if no other flag is passed)',
 )

 parser.add_argument(
     '-s', '--shift',
     action='store',
     type=int,
     metavar='MS',
     help='shifts all subtitles by MS milliseconds, which '
          'may be positive or negative',
 )

 parser.add_argument(
     '-n', '--no-html',
     action='store_true',
     help='strips HTML tags from subtitles content',
 )

 # Input files are opened by argparse itself, in binary read/write mode.
 parser.add_argument(
     'files',
     nargs='+',
     type=argparse.FileType('rb+'),
     metavar='file',
     help='list of input files (they all must be SubRip files)',
 )

 args = parser.parse_args()

 # With neither --shift nor --no-html given, fall back to cleaning.
 if not (args.shift or args.no_html):
     args.clean = True

 # Refuse to touch anything unless every file name ends in .srt.
 for file in args.files:
     if file.name.endswith('.srt'):
         continue
     print('File {} is not a SubRip file'.format(file.name),
           file=sys.stderr)
     sys.exit(1)

 # All names validated: process the files one by one.
 for file in args.files:
     process_file(args, file)
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1 @@
 chardet