Merge pull request #52 from kkroening/split-silence · virustyle/ffmpeg-python@09e6e46 · GitHub
[go: up one dir, main page]

Skip to content

Commit 09e6e46

Browse files
authored
Merge pull request kkroening#52 from kkroening/split-silence
Add examples: `split_silence` + `transcribe`
2 parents 0672fd0 + 3a818cc commit 09e6e46

File tree

3 files changed

+205
-0
lines changed

3 files changed

+205
-0
lines changed

examples/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
ffmpeg-python
2+
google-cloud-speech

examples/split_silence.py

Lines changed: 141 additions & 0 deletions
<tr class="diff-line-row">
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#!/usr/bin/env python
2+
from __future__ import unicode_literals
3+
4+
import argparse
5+
import errno
6+
import ffmpeg
7+
import logging
8+
import os
9+
import re
10+
import subprocess
11+
import sys
12+
13+
14+
# Emit bare messages at INFO level by default; the `-v` flag later raises
# this module's logger to DEBUG.
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__file__)
logger.setLevel(logging.INFO)

# Defaults handed to ffmpeg's `silencedetect` filter.
DEFAULT_DURATION = 0.3  # used as the filter's `d` option
DEFAULT_THRESHOLD = -60  # used as the filter's `n` option, in dB

parser = argparse.ArgumentParser(description='Split media into separate chunks wherever silence occurs')
parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
parser.add_argument('out_pattern', help='Output filename pattern (e.g. `out/chunk_{:04d}.wav`)')
parser.add_argument('--silence-threshold', default=DEFAULT_THRESHOLD, type=int, help='Silence threshold (in dB)')
parser.add_argument('--silence-duration', default=DEFAULT_DURATION, type=float, help='Silence duration')
parser.add_argument('--start-time', type=float, help='Start time (seconds)')
parser.add_argument('--end-time', type=float, help='End time (seconds)')
parser.add_argument('-v', dest='verbose', action='store_true', help='Verbose mode')

# Patterns matched against individual lines of ffmpeg's stderr output.
# Raw strings are required: `\.` is not a valid escape in a plain string
# literal and triggers a warning on modern Python versions.
silence_start_re = re.compile(r' silence_start: (?P<start>[0-9]+(\.?[0-9]*))$')
silence_end_re = re.compile(r' silence_end: (?P<end>[0-9]+(\.?[0-9]*)) ')
total_duration_re = re.compile(
    r'size=[^ ]+ time=(?P<hours>[0-9]{2}):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9\.]{5}) bitrate=')
34+
35+
36+
def _logged_popen(cmd_line, *args, **kwargs):
    """Start ``cmd_line`` as a subprocess, logging the command at DEBUG level first.

    Extra positional and keyword arguments are forwarded unchanged to
    ``subprocess.Popen``; the resulting ``Popen`` object is returned.
    """
    rendered_command = subprocess.list2cmdline(cmd_line)
    logger.debug('Running command: {}'.format(rendered_command))
    process = subprocess.Popen(cmd_line, *args, **kwargs)
    return process
39+
40+
41+
def get_chunk_times(in_filename, silence_threshold, silence_duration, start_time=None, end_time=None):
    """Find the non-silent chunks of ``in_filename`` via ffmpeg's ``silencedetect`` filter.

    ffmpeg is run with a null output and its stderr is scanned line by line
    for ``silence_start`` / ``silence_end`` markers and for the ``time=``
    progress line.

    Args:
        in_filename: Input passed to ``ffmpeg.input``.
        silence_threshold: Threshold in dB, passed as the filter's ``n`` option.
        silence_duration: Passed as the filter's ``d`` option.
        start_time: Optional start offset in seconds (ffmpeg ``ss``).
        end_time: Optional end offset in seconds; converted to a duration
            (ffmpeg ``t``) relative to ``start_time``.

    Returns:
        A list of ``(chunk_start, chunk_end)`` tuples of times in seconds.

    If ffmpeg exits with a non-zero status, its stderr is echoed and the
    process exits with status 1.
    """
    input_kwargs = {}
    if start_time is not None:
        input_kwargs['ss'] = start_time
    else:
        start_time = 0.
    if end_time is not None:
        input_kwargs['t'] = end_time - start_time

    p = _logged_popen(
        (ffmpeg
            .input(in_filename, **input_kwargs)
            .filter_('silencedetect', n='{}dB'.format(silence_threshold), d=silence_duration)
            .output('-', format='null')
            .compile()
        ) + ['-nostats'],  # FIXME: use .nostats() once it's implemented in ffmpeg-python.
        stderr=subprocess.PIPE
    )
    # ffmpeg's stderr may contain bytes that are not valid UTF-8 (e.g. file
    # metadata); replace them rather than crash, since only the ASCII
    # silencedetect/progress lines are parsed below.
    output = p.communicate()[1].decode('utf-8', 'replace')
    if p.returncode != 0:
        sys.stderr.write(output)
        sys.exit(1)
    logger.debug(output)
    lines = output.splitlines()

    # Chunks start when silence ends, and chunks end when silence starts.
    chunk_starts = []
    chunk_ends = []
    for line in lines:
        silence_start_match = silence_start_re.search(line)
        silence_end_match = silence_end_re.search(line)
        total_duration_match = total_duration_re.search(line)
        if silence_start_match:
            chunk_ends.append(float(silence_start_match.group('start')))
            if not chunk_starts:
                # Started with non-silence.
                chunk_starts.append(start_time or 0.)
        elif silence_end_match:
            chunk_starts.append(float(silence_end_match.group('end')))
        elif total_duration_match:
            # The progress line reports the processed time as HH:MM:SS.xx;
            # it overrides any caller-supplied end time.
            hours = int(total_duration_match.group('hours'))
            minutes = int(total_duration_match.group('minutes'))
            seconds = float(total_duration_match.group('seconds'))
            end_time = hours * 3600 + minutes * 60 + seconds

    if not chunk_starts:
        # No silence found.
        chunk_starts.append(start_time)

    if len(chunk_starts) > len(chunk_ends):
        # Finished with non-silence; fall back to a very large end time when
        # no end time is known.
        chunk_ends.append(end_time or 10000000.)

    return list(zip(chunk_starts, chunk_ends))
95+
96+
97+
def _makedirs(path):
98+
"""Python2-compatible version of ``os.makedirs(path, exist_ok=True)``."""
99+
try:
100+
os.makedirs(path)
101+
except OSError as exc:
102+
if exc.errno != errno.EEXIST or not os.path.isdir(path):
103+
raise
104+
105+
106+
def split_audio(
    in_filename,
    out_pattern,
    silence_threshold=DEFAULT_THRESHOLD,
    silence_duration=DEFAULT_DURATION,
    start_time=None,
    end_time=None,
    verbose=False,
):
    """Split ``in_filename`` into one output file per non-silent chunk.

    Args:
        in_filename: Input passed to ffmpeg.
        out_pattern: ``str.format`` pattern for output filenames; the chunk
            index is supplied both positionally and as ``i``.
        silence_threshold: Forwarded to ``get_chunk_times``.
        silence_duration: Forwarded to ``get_chunk_times``.
        start_time: Forwarded to ``get_chunk_times``.
        end_time: Forwarded to ``get_chunk_times``.
        verbose: When true, ffmpeg's stdout/stderr are inherited instead of
            being captured.
    """
    chunk_times = get_chunk_times(in_filename, silence_threshold, silence_duration, start_time, end_time)

    for i, (chunk_start, chunk_end) in enumerate(chunk_times):
        chunk_duration = chunk_end - chunk_start
        out_filename = out_pattern.format(i, i=i)

        # A pattern without a directory part yields '' here, which
        # os.makedirs() rejects; there is nothing to create in that case.
        out_dir = os.path.dirname(out_filename)
        if out_dir:
            _makedirs(out_dir)

        logger.info('{}: start={:.02f}, end={:.02f}, duration={:.02f}'.format(out_filename, chunk_start, chunk_end,
            chunk_duration))
        process = _logged_popen(
            (ffmpeg
                .input(in_filename, ss=chunk_start, t=chunk_duration)
                .output(out_filename)
                .overwrite_output()
                .compile()
            ),
            stdout=subprocess.PIPE if not verbose else None,
            stderr=subprocess.PIPE if not verbose else None,
        )
        _, captured_stderr = process.communicate()
        if process.returncode != 0:
            # Report the failure instead of silently skipping the chunk, but
            # keep going so the remaining chunks are still written.
            logger.error('ffmpeg exited with status {} while writing {}'.format(process.returncode, out_filename))
            if captured_stderr:
                logger.error(captured_stderr.decode('utf-8', 'replace'))
134+
135+
136+
if __name__ == '__main__':
    kwargs = vars(parser.parse_args())
    if kwargs['verbose']:
        # logging.basicConfig() does nothing once the root logger has
        # handlers (it was already configured at import time), so switch the
        # existing handlers to the verbose format directly.
        verbose_formatter = logging.Formatter('%(levelname)s: %(message)s')
        root_logger = logging.getLogger()
        for handler in root_logger.handlers:
            handler.setFormatter(verbose_formatter)
        root_logger.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
    split_audio(**kwargs)

examples/transcribe.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/usr/bin/env python
2+
from __future__ import unicode_literals
3+
4+
from google.cloud import speech
5+
from google.cloud.speech import enums
6+
from google.cloud.speech import types
7+
import argparse
8+
import ffmpeg
9+
import logging
10+
import subprocess
11+
import sys
12+
13+
14+
# Root logging emits bare messages; this module's logger runs at INFO.
logging.basicConfig(format='%(message)s', level=logging.INFO)
logger = logging.getLogger(__file__)
logger.setLevel(logging.INFO)

# Command-line interface: a single positional input filename.
parser = argparse.ArgumentParser(
    description='Convert speech audio to text using Google Speech API',
)
parser.add_argument(
    'in_filename',
    help='Input filename (`-` for stdin)',
)
21+
22+
23+
def decode_audio(in_filename, **input_kwargs):
    """Decode ``in_filename`` with ffmpeg and return the raw audio bytes.

    The output is requested as mono, 16 kHz, signed 16-bit little-endian PCM
    (``s16le`` / ``pcm_s16le``) written to ffmpeg's stdout.

    Args:
        in_filename: Input passed to ``ffmpeg.input``.
        **input_kwargs: Extra keyword arguments forwarded to ``ffmpeg.input``.

    Returns:
        The bytes captured from ffmpeg's stdout.

    If ffmpeg exits with a non-zero status, its stderr is echoed and the
    process exits with status 1.
    """
    p = subprocess.Popen(
        (ffmpeg
            .input(in_filename, **input_kwargs)
            .output('-', format='s16le', acodec='pcm_s16le', ac=1, ar='16k')
            .overwrite_output()
            .compile()
        ),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    out = p.communicate()
    if p.returncode != 0:
        # communicate() returns bytes; sys.stderr is a text stream, so the
        # captured output must be decoded before it can be written.
        sys.stderr.write(out[1].decode('utf-8', 'replace'))
        sys.exit(1)
    return out[0]
39+
40+
41+
def get_transcripts(audio_data):
    """Send ``audio_data`` to the speech client and return the transcripts.

    The request is configured for LINEAR16 audio at 16000 Hz with language
    code ``en-US``. The first alternative's transcript of every result in the
    response is returned, in order, as a list.
    """
    recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US'
    )
    recognition_audio = types.RecognitionAudio(content=audio_data)
    speech_client = speech.SpeechClient()
    response = speech_client.recognize(recognition_config, recognition_audio)

    transcripts = []
    for result in response.results:
        transcripts.append(result.alternatives[0].transcript)
    return transcripts
51+
52+
53+
def transcribe(in_filename):
    """Decode ``in_filename``, transcribe it, and print each transcript.

    Every transcript is printed on its own line as the ``repr`` of its
    UTF-8 encoded form.
    """
    for text in get_transcripts(decode_audio(in_filename)):
        encoded = text.encode('utf-8')
        print(repr(encoded))
58+
59+
60+
if __name__ == '__main__':
    # Script entry point: transcribe the file named on the command line.
    cli_args = parser.parse_args()
    transcribe(in_filename=cli_args.in_filename)

0 commit comments

Comments
 (0)
0