Rewrite script in Python

This commit is contained in:
Augusto Gunsch 2021-11-14 15:49:35 -03:00
parent 29c7b3e72f
commit 4b94f37e23
No known key found for this signature in database
GPG Key ID: F7EEFE29825C72DC
3 changed files with 232 additions and 53 deletions

View File

@ -1,9 +1,28 @@
# fsub
`fsub` is a very simple script (less than 60 lines of code) for cleaning a .srt file
`fsub` is a Python script for cleaning, editing and fixing a SubRip (.srt) file
# Usage
`fsub <file>`
```
usage: fsub [-h] [-c] [-s MS] [-n] file [file ...]
Fix, edit and clean SubRip (.srt) files.
positional arguments:
file list of input files (they all must be SubRip files)
optional arguments:
-h, --help show this help message and exit
-c, --clean removes subtitles matching regular expressions listed in ~/.config/fsubrc
(this is the default behavior if no other flag is passed)
-s MS, --shift MS shifts all subtitles by MS milliseconds, which may be positive or
negative
-n, --no-html strips HTML tags from subtitles content
```
# Features
- Fixes subtitle numbering
- Removes lines which have words listed in `~/.config/fsubrc`
- Converts files to UTF-8 encoding
- Validates file structure
- May remove subtitles containing lines that match any regular expression listed in `~/.config/fsubrc`
- May shift the time of all subtitles
- May strip HTML

259
fsub
View File

@ -1,59 +1,218 @@
#!/bin/sh
set -e
#!/bin/python
import sys
import argparse
import re
import chardet
import os
FSUBRC=~/.config/fsubrc
# Print a short usage message (including the blacklist path held in
# $FSUBRC) and terminate the script with exit status 1.
usage() {
echo "usage: fsub <files>"
echo "fsub expects $FSUBRC to have a blacklist of words"
exit 1
}
class Time:
def __init__(self, time_str, file_name, line_number):
parsed_time = time_str.split(':')
try:
h = int(parsed_time[0])
m = int(parsed_time[1])
ms = int(parsed_time[2].replace(',', ''))
# self.time: time in milliseconds
self.time = h * 3600000 + m * 60000 + ms
except Exception:
print('Invalid time format detected ({}:{})'
.format(file_name, line_number),
file=sys.stderr)
sys.exit(1)
[ -z "$1" ] && usage
def add(self, ms):
    """Move this timestamp by ``ms`` milliseconds (negative moves it back)."""
    self.time = self.time + ms
# Validate every argument up front: a name ending in .srt must refer to an
# existing regular file, and any other name is rejected outright. Both
# failure paths call usage, which exits with status 1.
for arg in "$@"; do
case "$arg" in
*.srt )
if [ ! -f "$arg" ]; then
echo "$arg is not a file"
usage
fi ;;
*) echo "$arg is not a .srt file"; usage ;;
esac
done
def __repr__(self):
ms = self.time % 1000
s = (self.time % 60000) / 1000
m = (self.time / 60000) % 60
h = self.time / 3600000
return '%02d:%02d:%02d,%03d' % (h, m, s, ms)
[ -f "$FSUBRC" ] || touch $FSUBRC
for arg in "$@"; do
awk '
BEGIN {
n = 1
i = 0
while(getline < "'$FSUBRC'") {
blacklist[i] = $0
i++
}
}
/^[[:digit:]]+[[:space:]]*$/ {
getline
time = $0
class Subtitle:
def __init__(self, lines, file_name, line_number):
try:
# This is mostly ignored, as the subtitles are renumbered later
self.number = int(lines.pop(0))
except Exception:
print('Invalid line number detected ({}:{})'
.format(file_name, line_number),
file=sys.stderr)
sys.exit(1)
linen = 0
while(getline) {
lines[linen] = $0
linen++
if($0 ~ /^[[:space:]]*$/) break
}
line_number += 1
for(j = 0; j < i; j++)
for(k = 0; k < linen; k++)
if(lines[k] ~ blacklist[j]) next
try:
time_span = lines.pop(0).split(' --> ')
print n
n++
print time
for(j = 0; j < linen; j++)
print lines[j]
}' "$arg" | sed 's/\r//' > /tmp/fsub
mv /tmp/fsub "$arg"
done
self.time_start = Time(time_span[0], file_name, line_number)
self.time_end = Time(time_span[1], file_name, line_number)
except Exception:
print('Invalid time span format detected ({}:{})'
.format(file_name, line_number),
file=sys.stderr)
sys.exit(1)
self.content = lines
def shift(self, ms):
    """Move both the start and the end time by ``ms`` milliseconds."""
    for stamp in (self.time_start, self.time_end):
        stamp.add(ms)
def matches(self, regexp):
    """Return True when ``regexp`` is found in at least one content line.

    ``regexp`` is a compiled pattern; it is searched for anywhere in each
    line, not anchored at the start.
    """
    return any(regexp.search(text) is not None for text in self.content)
def __repr__(self):
return '{}\n{} --> {}\n{}'.format(
self.number,
self.time_start, self.time_end,
os.linesep.join(self.content)
)
def clean(subs):
    """Remove subtitles that match an expression listed in ~/.config/fsubrc.

    Each non-blank line of the rc file is compiled as a regular
    expression.  A subtitle is dropped when ``sub.matches(regexp)`` is
    true for any of them.

    Blank lines are skipped: an empty pattern matches every line and
    would silently delete every subtitle.  A missing or empty rc file
    means there is nothing to filter, so the subtitles are returned
    unchanged.

    Always returns a new list; ``subs`` itself is not modified.
    """
    # expanduser also works when $HOME is unset
    rc_path = os.path.expanduser('~/.config/fsubrc')
    try:
        with open(rc_path, 'r') as fsubrc:
            lines = re.split(r'\r?\n', fsubrc.read().strip())
    except FileNotFoundError:
        lines = []

    expressions = [re.compile(line) for line in lines if line.strip()]

    # Nothing to filter with
    if not expressions:
        return list(subs)

    # Keep only the subtitles that match none of the expressions
    return [
        sub for sub in subs
        if not any(sub.matches(regexp) for regexp in expressions)
    ]
def shift(subs, ms):
    """Shift every subtitle by ``ms`` milliseconds.

    All subtitles are moved first; the returned list then keeps only
    those whose start time is still at or after zero.
    """
    for entry in subs:
        entry.shift(ms)
    return [entry for entry in subs if entry.time_start.time >= 0]
def strip_html(subs):
    """Remove HTML-style tags from the content lines of every subtitle.

    Each ``sub.content`` list is updated in place; nothing is returned.

    A tag is ``<``, one or more characters other than ``>``, then ``>``.
    Stopping at the first ``>`` matters: a greedy ``<.+>`` would run
    from the first ``<`` to the last ``>`` on the line and erase the
    text between an opening and a closing tag.
    """
    tag = re.compile(r'<[^>]+>')
    for sub in subs:
        sub.content[:] = [tag.sub('', line) for line in sub.content]
def process_file(args, file):
    """Read, edit and rewrite one SubRip file in place.

    ``args`` is the parsed command line; its ``clean``, ``shift`` and
    ``no_html`` attributes select the edits to apply.  ``file`` is an
    open binary file object; it is read fully and closed, and the path
    in ``file.name`` is then overwritten with the UTF-8 result.

    Exits with status 1 when the encoding cannot be detected.
    """
    # Read the input file
    contents = file.read()
    file.close()

    # Decode the file contents
    encoding = chardet.detect(contents)['encoding']
    if encoding is None:
        print('Corrupt or empty file ({})'.format(file.name),
              file=sys.stderr)
        sys.exit(1)
    contents = contents.decode(encoding)

    # Count empty lines at the beginning, so that reported line numbers
    # refer to the original file
    r = re.compile(r'\r?\n')
    line_number = 1
    for line in r.split(contents):
        if len(line) == 0 or line.isspace():
            line_number += 1
        else:
            break

    # Split subtitles on empty lines
    subs = re.split(r'(?:\r?\n){2}', contents.strip())

    # Create Subtitle objects
    subs_objs = []
    for sub in subs:
        lines = list(r.split(sub))
        subs_objs.append(Subtitle(lines, file.name, line_number))
        # Subtitle() consumed the number and time lines from `lines`, so
        # the block spanned len(lines) + 2 lines plus the blank separator.
        line_number += len(lines) + 3

    # Clean if --clean is passed
    if args.clean:
        subs_objs = clean(subs_objs)

    # Shift if --shift is passed
    if args.shift:
        subs_objs = shift(subs_objs, args.shift)

    # Strip HTML if --no-html is passed
    if args.no_html:
        strip_html(subs_objs)

    # Fix numbering
    for number, sub in enumerate(subs_objs, start=1):
        sub.number = number

    # Join Subtitle objects back to a string.  The text is normalised to
    # plain '\n' because the output file is opened in text mode, which
    # converts '\n' to the platform line ending itself; writing
    # os.linesep through it would produce '\r\r\n' on Windows.
    blocks = [repr(sub).replace(os.linesep, '\n') for sub in subs_objs]

    # Write output; the context manager flushes and closes the file
    with open(file.name, 'w', encoding='utf-8') as output:
        output.write('\n\n'.join(blocks))
        output.write('\n')
# Command-line interface: three optional editing flags plus one or more
# input files, which argparse opens in binary read/update mode.
parser = argparse.ArgumentParser(
    add_help=True,
    description='Fix, edit and clean SubRip (.srt) files.',
)

parser.add_argument(
    '-c', '--clean',
    action='store_true',
    help=('removes subtitles matching regular expressions '
          'listed in ~/.config/fsubrc (this is the default '
          'behavior if no other flag is passed)'),
)

parser.add_argument(
    '-s', '--shift',
    action='store',
    type=int,
    metavar='MS',
    help=('shifts all subtitles by MS milliseconds, which '
          'may be positive or negative'),
)

parser.add_argument(
    '-n', '--no-html',
    action='store_true',
    help='strips HTML tags from subtitles content',
)

parser.add_argument(
    'files',
    nargs='+',
    type=argparse.FileType('rb+'),
    metavar='file',
    help='list of input files (they all must be SubRip files)',
)

args = parser.parse_args()

# With no editing flag given, fall back to cleaning
if not (args.shift or args.no_html):
    args.clean = True

# Refuse to touch anything unless every file is named *.srt
for file in args.files:
    if not file.name.endswith('.srt'):
        print('File {} is not a SubRip file'.format(file.name),
              file=sys.stderr)
        sys.exit(1)

# Every name passed the check: process the files one by one
for file in args.files:
    process_file(args, file)

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
chardet