Rewrite script in Python
This commit is contained in:
parent
29c7b3e72f
commit
4b94f37e23
25
README.md
25
README.md
|
@ -1,9 +1,28 @@
|
||||||
# fsub
|
# fsub
|
||||||
`fsub` is a very simple script (less than 60 lines of code) for cleaning a .srt file
|
`fsub` is a Python script for cleaning, editing and fixing a SubRip (.srt) file
|
||||||
|
|
||||||
# Usage
|
# Usage
|
||||||
`fsub <file>`
|
```
|
||||||
|
usage: fsub [-h] [-c] [-s MS] [-n] file [file ...]
|
||||||
|
|
||||||
|
Fix, edit and clean SubRip (.srt) files.
|
||||||
|
|
||||||
|
positional arguments:
|
||||||
|
file list of input files (they all must be SubRip files)
|
||||||
|
|
||||||
|
optional arguments:
|
||||||
|
-h, --help show this help message and exit
|
||||||
|
-c, --clean removes subtitles matching regular expressions listed in ~/.config/fsubrc
|
||||||
|
(this is the default behavior if no other flag is passed)
|
||||||
|
-s MS, --shift MS shifts all subtitles by MS milliseconds, which may be positive or
|
||||||
|
negative
|
||||||
|
-n, --no-html strips HTML tags from subtitles content
|
||||||
|
```
|
||||||
|
|
||||||
# Features
|
# Features
|
||||||
- Fixes subtitle numbering
|
- Fixes subtitle numbering
|
||||||
- Removes lines which have words listed in `~/.config/fsubrc`
|
- Converts files to UTF-8 encoding
|
||||||
|
- Validates file structure
|
||||||
|
- May remove subtitles containing lines that match any regular expression listed in `~/.config/fsubrc`
|
||||||
|
- May shift the timing of all subtitles by a fixed number of milliseconds (forwards or backwards)
|
||||||
|
- May strip HTML tags from the subtitle text
|
||||||
|
|
259
fsub
259
fsub
|
@ -1,59 +1,218 @@
|
||||||
#!/bin/sh
|
#!/bin/python
|
||||||
set -e
|
import sys
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
import chardet
|
||||||
|
import os
|
||||||
|
|
||||||
FSUBRC=~/.config/fsubrc
|
|
||||||
|
|
||||||
usage() {
|
class Time:
    """A SubRip timestamp held as a whole number of milliseconds.

    The value lives in ``self.time`` and is rendered back in the
    ``HH:MM:SS,mmm`` form by ``repr()``.
    """

    def __init__(self, time_str, file_name, line_number):
        """Parse an ``HH:MM:SS,mmm`` timestamp.

        ``file_name`` and ``line_number`` are only used to build the error
        message.  On a malformed timestamp the message is printed to stderr
        and the process exits with status 1.
        """
        parsed_time = time_str.split(':')
        try:
            # A timestamp has exactly three colon-separated fields; anything
            # else used to be accepted with the surplus fields ignored.
            if len(parsed_time) != 3:
                raise ValueError(time_str)
            h = int(parsed_time[0])
            m = int(parsed_time[1])
            # Dropping the comma turns "SS,mmm" into a millisecond count
            ms = int(parsed_time[2].replace(',', ''))
        except ValueError:
            print('Invalid time format detected ({}:{})'
                  .format(file_name, line_number),
                  file=sys.stderr)
            sys.exit(1)

        # self.time: time in milliseconds
        self.time = h * 3600000 + m * 60000 + ms

    def add(self, ms):
        """Move the timestamp by ``ms`` milliseconds (may be negative)."""
        self.time += ms

    def __repr__(self):
        """Return the timestamp formatted as ``HH:MM:SS,mmm``."""
        # Integer arithmetic only: no floats to truncate implicitly
        total_seconds, ms = divmod(self.time, 1000)
        total_minutes, s = divmod(total_seconds, 60)
        h, m = divmod(total_minutes, 60)
        return '%02d:%02d:%02d,%03d' % (h, m, s, ms)
|
||||||
fi ;;
|
|
||||||
*) echo "$arg is not a .srt file"; usage ;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
[ -f "$FSUBRC" ] || touch $FSUBRC
|
|
||||||
|
|
||||||
for arg in "$@"; do
|
class Subtitle:
    """One SubRip cue: its number, start/end times and text lines."""

    def __init__(self, lines, file_name, line_number):
        """Build a cue from the lines of one subtitle block.

        ``lines`` is consumed in place: the number line and the time-span
        line are popped off, and the remaining list (the same object) becomes
        ``self.content``.  ``file_name`` and ``line_number`` (the line of the
        cue number) are only used in error messages.  On a malformed block a
        message is printed to stderr and the process exits with status 1.
        """
        try:
            # This is mostly ignored, as the subtitles are renumbered later
            self.number = int(lines.pop(0))
        except (IndexError, ValueError):
            print('Invalid line number detected ({}:{})'
                  .format(file_name, line_number),
                  file=sys.stderr)
            sys.exit(1)

        # The time span sits on the line right after the number
        line_number += 1

        try:
            time_span = lines.pop(0).split(' --> ')
            self.time_start = Time(time_span[0], file_name, line_number)
            self.time_end = Time(time_span[1], file_name, line_number)
        except (IndexError, ValueError):
            print('Invalid time span format detected ({}:{})'
                  .format(file_name, line_number),
                  file=sys.stderr)
            sys.exit(1)

        self.content = lines

    def shift(self, ms):
        """Move both ends of the cue by ``ms`` milliseconds."""
        self.time_start.add(ms)
        self.time_end.add(ms)

    def matches(self, regexp):
        """Return True if the compiled ``regexp`` matches in any text line."""
        return any(regexp.search(line) for line in self.content)

    def __repr__(self):
        """Return the cue as SubRip text, without a trailing blank line."""
        # Plain '\n' throughout: the text is written through a text-mode
        # file, which already translates '\n' to the platform line ending.
        return '{}\n{} --> {}\n{}'.format(
            self.number,
            self.time_start, self.time_end,
            '\n'.join(self.content)
        )
|
||||||
|
|
||||||
|
|
||||||
|
def clean(subs):
    """Return the subtitles that match none of the user's expressions.

    The expressions are read from ``~/.config/fsubrc``, one regular
    expression per line; blank lines are ignored.  A subtitle is dropped as
    soon as any expression matches one of its lines (``sub.matches``).

    If the file is missing or holds no expression, every subtitle is kept.
    Always returns a new list.
    """
    # Read expressions in ~/.config/fsubrc
    rc_path = os.path.join(os.path.expanduser('~'), '.config', 'fsubrc')
    try:
        with open(rc_path, 'r') as fsubrc:
            lines = re.split(r'\r?\n', fsubrc.read().strip())
    except FileNotFoundError:
        # No configuration means nothing to remove
        return list(subs)

    # An empty pattern matches everything, so blank lines must be skipped
    expressions = [re.compile(line) for line in lines if line.strip()]

    # Cancel if no expression
    if not expressions:
        return list(subs)

    # Remove subtitles matching any expression.  Every expression is
    # evaluated here and now, rather than through lazily chained filters
    # whose lambdas would all see only the last loop value.
    return [
        sub for sub in subs
        if not any(sub.matches(regexp) for regexp in expressions)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def shift(subs, ms):
    """Shift every subtitle by ``ms`` milliseconds and return the survivors.

    Each subtitle is moved in place; the returned list keeps only those
    whose start time is still at or after zero.
    """
    for entry in subs:
        entry.shift(ms)

    # Subtitles pushed before the beginning of the stream are discarded
    return [entry for entry in subs if entry.time_start.time >= 0]
|
||||||
|
|
||||||
|
|
||||||
|
def strip_html(subs):
    """Remove HTML-style tags from the text of every subtitle, in place.

    Each ``sub.content`` list is updated in place (the list object is kept);
    nothing is returned.
    """
    # '[^>]+' stops at the first '>' so each tag is removed on its own; a
    # greedy '.+' would swallow the text between an opening and closing tag.
    tag = re.compile(r'<[^>]+>')
    for sub in subs:
        sub.content[:] = [tag.sub('', line) for line in sub.content]
|
||||||
|
|
||||||
|
|
||||||
|
def process_file(args, file):
    """Rewrite one SubRip file in place as renumbered UTF-8 text.

    ``args`` is the parsed command line; ``args.clean``, ``args.shift`` and
    ``args.no_html`` select the optional steps.  ``file`` is a binary file
    object opened on the subtitle file: it is read completely and closed
    here, then ``file.name`` is reopened for writing.

    Exits with status 1 if the encoding cannot be detected; malformed
    subtitle blocks make ``Subtitle`` exit the same way.
    """
    # Read the input file; the handle is released even if read() fails
    with file:
        contents = file.read()

    # Decode the file contents
    encoding = chardet.detect(contents)['encoding']
    if encoding is None:
        print('Corrupt or empty file ({})'.format(file.name),
              file=sys.stderr)
        sys.exit(1)
    contents = contents.decode(encoding)

    # Count empty lines at the beginning, so that reported line numbers
    # refer to the original file even though the text is stripped below
    r = re.compile(r'\r?\n')
    line_number = 1
    for line in r.split(contents):
        if line.strip():
            break
        line_number += 1

    # Split subtitles on runs of empty lines.  The separators are captured
    # so the line counter can account for any extra blank lines.
    pieces = re.split(r'((?:\r?\n){2,})', contents.strip())
    chunks = pieces[0::2]
    gaps = pieces[1::2] + ['']

    # Create Subtitle objects
    subs_objs = []
    for chunk, gap in zip(chunks, gaps):
        lines = r.split(chunk)
        # Measured before Subtitle() pops the header lines off the list
        height = len(lines)
        subs_objs.append(Subtitle(lines, file.name, line_number))
        # The first newline of the gap ends the chunk's last line; every
        # further newline is one blank line before the next subtitle
        line_number += height + max(gap.count('\n') - 1, 0)

    # Clean if --clean is passed
    if args.clean:
        subs_objs = clean(subs_objs)

    # Shift if --shift is passed
    if args.shift:
        subs_objs = shift(subs_objs, args.shift)

    # Strip HTML if --no-html is passed
    if args.no_html:
        strip_html(subs_objs)

    # Fix numbering
    for number, sub in enumerate(subs_objs, start=1):
        sub.number = number

    # Join Subtitle objects back to a string.  '\n' is used because the
    # text-mode file below translates it to the platform line ending.
    contents = '\n\n'.join(map(repr, subs_objs))

    # Write output; the context manager flushes and closes the file
    with open(file.name, 'w', encoding='utf-8') as output:
        output.write(contents)
        output.write('\n')
|
||||||
|
|
||||||
|
|
||||||
|
# Command-line interface: three optional switches plus one or more files
parser = argparse.ArgumentParser(
    add_help=True,
    description='Fix, edit and clean SubRip (.srt) files.',
)

parser.add_argument(
    '-c', '--clean',
    action='store_true',
    help=('removes subtitles matching regular expressions '
          'listed in ~/.config/fsubrc (this is the default '
          'behavior if no other flag is passed)'),
)

parser.add_argument(
    '-s', '--shift',
    action='store',
    type=int,
    metavar='MS',
    help=('shifts all subtitles by MS milliseconds, which '
          'may be positive or negative'),
)

parser.add_argument(
    '-n', '--no-html',
    action='store_true',
    help='strips HTML tags from subtitles content',
)

# Files are opened by argparse itself, in binary read/write mode
parser.add_argument(
    'files',
    nargs='+',
    type=argparse.FileType('rb+'),
    metavar='file',
    help='list of input files (they all must be SubRip files)',
)

args = parser.parse_args()

# Make sure --clean is the default when no other action was requested
if not (args.shift or args.no_html):
    args.clean = True

# Check if all files are .srt before touching any of them
for file in args.files:
    if file.name.endswith('.srt'):
        continue
    print('File {} is not a SubRip file'.format(file.name),
          file=sys.stderr)
    sys.exit(1)

# Every name passed the check: process the files one by one
for file in args.files:
    process_file(args, file)
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
chardet
|
Loading…
Reference in New Issue