Rewrite script in Python
This commit is contained in:
parent
29c7b3e72f
commit
4b94f37e23
25
README.md
25
README.md
|
@ -1,9 +1,28 @@
|
|||
# fsub
|
||||
`fsub` is a very simple script (less than 60 lines of code) for cleaning a .srt file
|
||||
`fsub` is a Python script for cleaning, editing and fixing a SubRip (.srt) file
|
||||
|
||||
# Usage
|
||||
`fsub <file>`
|
||||
```
|
||||
usage: fsub [-h] [-c] [-s MS] [-n] file [file ...]
|
||||
|
||||
Fix, edit and clean SubRip (.srt) files.
|
||||
|
||||
positional arguments:
|
||||
file list of input files (they all must be SubRip files)
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-c, --clean removes subtitles matching regular expressions listed in ~/.config/fsubrc
|
||||
(this is the default behavior if no other flag is passed)
|
||||
-s MS, --shift MS shifts all subtitles by MS milliseconds, which may be positive or
|
||||
negative
|
||||
-n, --no-html strips HTML tags from subtitles content
|
||||
```
|
||||
|
||||
# Features
|
||||
- Fixes subtitle numbering
|
||||
- Removes lines which have words listed in `~/.config/fsubrc`
|
||||
- Converts files to UTF-8 encoding
|
||||
- Validates file structure
|
||||
- May remove subtitles containing lines that match any regular expression listed in `~/.config/fsubrc`
|
||||
- May shift the time of all subtitles
|
||||
- May strip HTML
|
||||
|
|
259
fsub
259
fsub
|
@ -1,59 +1,218 @@
|
|||
#!/bin/sh
|
||||
set -e
|
||||
#!/bin/python
|
||||
import sys
|
||||
import argparse
|
||||
import re
|
||||
import chardet
|
||||
import os
|
||||
|
||||
FSUBRC=~/.config/fsubrc
|
||||
|
||||
usage() {
|
||||
echo "usage: fsub <files>"
|
||||
echo "fsub expects $FSUBRC to have a blacklist of words"
|
||||
exit 1
|
||||
}
|
||||
class Time:
    """A SubRip timestamp held as a whole number of milliseconds."""

    def __init__(self, time_str, file_name, line_number):
        """Parse an ``HH:MM:SS,mmm`` string into ``self.time``.

        ``file_name`` and ``line_number`` are only used to build the error
        message. On a malformed timestamp the message is printed to stderr
        and the process exits with status 1.
        """
        parsed_time = time_str.split(':')
        try:
            h = int(parsed_time[0])
            m = int(parsed_time[1])
            # Dropping the comma turns "SS,mmm" into the integer SSmmm, i.e.
            # seconds and milliseconds already expressed in milliseconds.
            # NOTE(review): this assumes exactly three millisecond digits.
            ms = int(parsed_time[2].replace(',', ''))
        except (IndexError, ValueError):
            print('Invalid time format detected ({}:{})'
                  .format(file_name, line_number),
                  file=sys.stderr)
            sys.exit(1)

        # self.time: time in milliseconds
        self.time = h * 3600000 + m * 60000 + ms

    def add(self, ms):
        """Move the timestamp by ``ms`` milliseconds (may be negative)."""
        self.time += ms

    def __repr__(self):
        """Return the timestamp formatted as ``HH:MM:SS,mmm``."""
        # Integer arithmetic throughout: true division would produce floats
        # that only format correctly because '%d' happens to truncate them.
        total_seconds, ms = divmod(self.time, 1000)
        total_minutes, s = divmod(total_seconds, 60)
        h, m = divmod(total_minutes, 60)
        return '%02d:%02d:%02d,%03d' % (h, m, s, ms)
|
||||
|
||||
[ -f "$FSUBRC" ] || touch $FSUBRC
|
||||
|
||||
for arg in "$@"; do
|
||||
awk '
|
||||
BEGIN {
|
||||
n = 1
|
||||
i = 0
|
||||
while(getline < "'$FSUBRC'") {
|
||||
blacklist[i] = $0
|
||||
i++
|
||||
}
|
||||
}
|
||||
/^[[:digit:]]+[[:space:]]*$/ {
|
||||
getline
|
||||
time = $0
|
||||
class Subtitle:
    """One SubRip entry: sequence number, time span and text lines."""

    def __init__(self, lines, file_name, line_number):
        """Build a subtitle from the raw lines of one block.

        ``lines`` is consumed in place: the number line and the time-span
        line are popped off the front and the remaining list object becomes
        ``self.content``. ``file_name`` and ``line_number`` are only used
        for error messages. Malformed input prints a message to stderr and
        exits with status 1.
        """
        try:
            # This is mostly ignored, as the subtitles are renumbered later
            self.number = int(lines.pop(0))
        except (IndexError, ValueError):
            print('Invalid line number detected ({}:{})'
                  .format(file_name, line_number),
                  file=sys.stderr)
            sys.exit(1)

        # The time span sits on the line right after the number.
        line_number += 1

        try:
            time_span = lines.pop(0).split(' --> ')
            self.time_start = Time(time_span[0], file_name, line_number)
            self.time_end = Time(time_span[1], file_name, line_number)
        except (IndexError, ValueError):
            # IndexError: the line is missing, or it has no ' --> ' separator.
            print('Invalid time span format detected ({}:{})'
                  .format(file_name, line_number),
                  file=sys.stderr)
            sys.exit(1)

        self.content = lines

    def shift(self, ms):
        """Move both ends of the time span by ``ms`` milliseconds."""
        self.time_start.add(ms)
        self.time_end.add(ms)

    def matches(self, regexp):
        """Return True if the compiled ``regexp`` is found in any text line."""
        return any(regexp.search(line) for line in self.content)

    def __repr__(self):
        """Return the entry in SubRip layout, without a trailing newline."""
        # Plain '\n' everywhere: the result is written through a text-mode
        # file, which already translates '\n' to the platform line ending.
        return '{}\n{} --> {}\n{}'.format(
            self.number,
            self.time_start, self.time_end,
            '\n'.join(self.content)
        )
|
||||
|
||||
|
||||
def clean(subs, rc_path=None):
    """Return the subtitles that match none of the configured expressions.

    Expressions are read one per line from ``rc_path``, which defaults to
    ``~/.config/fsubrc``. Blank lines are ignored, because an empty pattern
    matches every line and would remove every subtitle. When the file is
    missing or holds no expression, all subtitles are kept.

    Always returns a new list; ``subs`` itself is not modified. An invalid
    regular expression raises ``re.error``.
    """
    if rc_path is None:
        rc_path = os.path.join(os.path.expanduser('~'), '.config', 'fsubrc')

    # Read expressions in ~/.config/fsubrc
    try:
        with open(rc_path, 'r') as fsubrc:
            text = fsubrc.read()
    except FileNotFoundError:
        return list(subs)

    expressions = [
        re.compile(pattern)
        for pattern in re.split(r'\r?\n', text.strip())
        if pattern.strip()
    ]

    # Cancel if no expression
    if not expressions:
        return list(subs)

    # Remove subtitles matching any expression. Every expression is tested
    # eagerly here; chaining lazy filter() calls over a loop variable would
    # leave all of them bound to the last expression only.
    return [
        sub for sub in subs
        if not any(sub.matches(regexp) for regexp in expressions)
    ]
|
||||
|
||||
|
||||
def shift(subs, ms):
    """Shift every subtitle by ``ms`` milliseconds, in place.

    Returns a new list holding only the subtitles whose start time is still
    non-negative after the shift.
    """
    for entry in subs:
        entry.shift(ms)
    return [entry for entry in subs if entry.time_start.time >= 0]
|
||||
|
||||
|
||||
def strip_html(subs):
    """Remove HTML-style tags from every text line of every subtitle.

    Each ``sub.content`` list is edited in place; nothing is returned.
    """
    # Match one tag at a time. A greedy '<.+>' would swallow everything
    # between the first '<' and the last '>' on the line, text included.
    # Requiring a letter after '<' or '</' leaves comparisons such as
    # "1 < 2" alone.
    tag = re.compile(r'</?[A-Za-z][^<>]*>')
    for sub in subs:
        sub.content[:] = [tag.sub('', line) for line in sub.content]
|
||||
|
||||
|
||||
def process_file(args, file):
    """Rewrite one SubRip file in place according to the parsed options.

    ``file`` is an already-open binary file object; it is read fully and
    closed. ``args`` supplies ``clean``, ``shift`` and ``no_html``. The
    content is decoded with the encoding chardet detects, parsed into
    Subtitle objects, optionally cleaned, shifted and stripped of HTML,
    renumbered from 1, and written back to ``file.name`` as UTF-8. Exits
    with status 1 when no encoding can be detected.
    """
    # Read the input file
    with file:
        contents = file.read()

    # Decode the file contents
    encoding = chardet.detect(contents)['encoding']
    if encoding is None:
        print('Corrupt or empty file ({})'.format(file.name),
              file=sys.stderr)
        sys.exit(1)
    contents = contents.decode(encoding)

    # Count empty lines at the beginning
    r = re.compile(r'\r?\n')
    line_number = 1
    for line in r.split(contents):
        if len(line) == 0 or line.isspace():
            line_number += 1
        else:
            break

    # Split subtitles on empty lines
    subs = re.split(r'(?:\r?\n){2}', contents.strip())

    # Create Subtitle objects
    subs_objs = []
    for sub in subs:
        lines = list(r.split(sub))
        subs_objs.append(Subtitle(lines, file.name, line_number))
        # Subtitle() popped the number and time-span lines off `lines`, so
        # what is left is the text: text lines + those 2 + 1 blank separator.
        line_number += len(lines) + 3

    # Clean if --clean is passed
    if args.clean:
        subs_objs = clean(subs_objs)

    # Shift if --shift is passed
    if args.shift:
        subs_objs = shift(subs_objs, args.shift)

    # Strip HTML if --no-html is passed
    if args.no_html:
        strip_html(subs_objs)

    # Fix numbering
    for number, sub in enumerate(subs_objs, start=1):
        sub.number = number

    # Join Subtitle objects back to a string. Plain '\n' is used because the
    # text-mode file below already translates it to the platform line ending.
    contents = '\n\n'.join(map(repr, subs_objs))

    # Write output; the context manager flushes and closes the file
    with open(file.name, 'w', encoding='utf-8') as output:
        output.write(contents)
        output.write('\n')
|
||||
|
||||
|
||||
# Command-line interface: build the parser, validate the inputs, then
# process each file in the order given.
parser = argparse.ArgumentParser(
    add_help=True,
    description='Fix, edit and clean SubRip (.srt) files.',
)

parser.add_argument(
    '-c', '--clean',
    action='store_true',
    help=('removes subtitles matching regular expressions '
          'listed in ~/.config/fsubrc (this is the default '
          'behavior if no other flag is passed)'),
)

parser.add_argument(
    '-s', '--shift',
    action='store',
    type=int,
    metavar='MS',
    help=('shifts all subtitles by MS milliseconds, which '
          'may be positive or negative'),
)

parser.add_argument(
    '-n', '--no-html',
    action='store_true',
    help='strips HTML tags from subtitles content',
)

# argparse opens every input file (binary, read/write) while parsing.
parser.add_argument(
    'files',
    nargs='+',
    type=argparse.FileType('rb+'),
    metavar='file',
    help='list of input files (they all must be SubRip files)',
)

args = parser.parse_args()

# --clean is implied when neither --shift nor --no-html was requested
if not (args.shift or args.no_html):
    args.clean = True

# Refuse to run unless every file name ends in .srt
for file in args.files:
    if not file.name.endswith('.srt'):
        print('File {} is not a SubRip file'.format(file.name),
              file=sys.stderr)
        sys.exit(1)

for file in args.files:
    process_file(args, file)
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
chardet
|
Loading…
Reference in New Issue