Improve logging in split_silence; add transcribe example · ronggan/ffmpeg-python@f5f7ee2 · GitHub
[go: up one dir, main page]

Skip to content

Commit f5f7ee2

Browse files
committed
Improve logging in split_silence; add transcribe example
1 parent ad58a38 commit f5f7ee2

File tree

2 files changed

+88
-12
lines changed

2 files changed

+88
-12
lines changed

examples/split_silence.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import subprocess
1111
import sys
1212

13+
1314
logging.basicConfig(level=logging.INFO, format='%(message)s')
1415
logger = logging.getLogger(__file__)
1516
logger.setLevel(logging.INFO)
@@ -24,14 +25,19 @@
2425
parser.add_argument('--silence-duration', default=DEFAULT_DURATION, type=float, help='Silence duration')
2526
parser.add_argument('--start-time', type=float, help='Start time (seconds)')
2627
parser.add_argument('--end-time', type=float, help='End time (seconds)')
27-
28+
parser.add_argument('-v', dest='verbose', action='store_true', help='Verbose mode')
2829

2930
silence_start_re = re.compile(' silence_start: (?P<start>[0-9]+(\.?[0-9]*))$')
3031
silence_end_re = re.compile(' silence_end: (?P<end>[0-9]+(\.?[0-9]*)) ')
3132
total_duration_re = re.compile(
3233
'size=[^ ]+ time=(?P<hours>[0-9]{2}):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9\.]{5}) bitrate=')
3334

3435

36+
def _logged_popen(cmd_line, *args, **kwargs):
    """Log ``cmd_line`` at DEBUG level, then launch it with ``subprocess.Popen``.

    Any extra positional and keyword arguments are forwarded to
    ``subprocess.Popen`` unchanged, and the resulting ``Popen`` object is
    returned to the caller.
    """
    # Render the argument list as a single shell-like string for the log.
    printable_cmd = subprocess.list2cmdline(cmd_line)
    logger.debug('Running command: {}'.format(printable_cmd))
    process = subprocess.Popen(cmd_line, *args, **kwargs)
    return process
39+
40+
3541
def get_chunk_times(in_filename, silence_threshold, silence_duration, start_time=None, end_time=None):
3642
input_kwargs = {}
3743
if start_time is not None:
@@ -41,17 +47,20 @@ def get_chunk_times(in_filename, silence_threshold, silence_duration, start_time
4147
if end_time is not None:
4248
input_kwargs['t'] = end_time - start_time
4349

44-
args = (ffmpeg
45-
.input(in_filename, **input_kwargs)
46-
.filter_('silencedetect', n='{}dB'.format(silence_threshold), d=silence_duration)
47-
.output('-', format='null')
48-
.get_args()
50+
p = _logged_popen(
51+
(ffmpeg
52+
.input(in_filename, **input_kwargs)
53+
.filter_('silencedetect', n='{}dB'.format(silence_threshold), d=silence_duration)
54+
.output('-', format='null')
55+
.compile()
56+
) + ['-nostats'], # FIXME: use .nostats() once it's implemented in ffmpeg-python.
57+
stderr=subprocess.PIPE
4958
)
50-
p = subprocess.Popen(['ffmpeg'] + args, stderr=subprocess.PIPE)
5159
output = p.communicate()[1].decode('utf-8')
5260
if p.returncode != 0:
5361
sys.stderr.write(output)
5462
sys.exit(1)
63+
logger.debug(output)
5564
lines = output.splitlines()
5665

5766
# Chunks start when silence ends, and chunks end when silence starts.
@@ -93,13 +102,15 @@ def _makedirs(path):
93102
if exc.errno != errno.EEXIST or not os.path.isdir(path):
94103
raise
95104

105+
96106
def split_audio(
97107
in_filename,
98108
out_pattern,
99109
silence_threshold=DEFAULT_THRESHOLD,
100110
silence_duration=DEFAULT_DURATION,
101111
start_time=None,
102112
end_time=None,
113+
verbose=False,
103114
):
104115
chunk_times = get_chunk_times(in_filename, silence_threshold, silence_duration, start_time, end_time)
105116

@@ -110,18 +121,21 @@ def split_audio(
110121

111122
logger.info('{}: start={:.02f}, end={:.02f}, duration={:.02f}'.format(out_filename, start_time, end_time,
112123
time))
113-
subprocess.Popen(
124+
_logged_popen(
114125
(ffmpeg
115126
.input(in_filename, ss=start_time, t=time)
116127
.output(out_filename)
117128
.overwrite_output()
118129
.compile()
119130
),
120-
stdout=subprocess.PIPE,
121-
stderr=subprocess.PIPE,
131+
stdout=subprocess.PIPE if not verbose else None,
132+
stderr=subprocess.PIPE if not verbose else None,
122133
).communicate()
123134

124135

125136
if __name__ == '__main__':
    # Parse the command line into a dict so it can be splatted into
    # split_audio(); the `verbose` flag is consumed both here and there.
    kwargs = vars(parser.parse_args())
    if kwargs['verbose']:
        # logging.basicConfig() does nothing once the root logger already has
        # handlers, and the module-level basicConfig() call has installed one.
        # Re-configure the existing handlers directly so the verbose format
        # actually takes effect.  The LogRecord attribute for the level name
        # is `levelname` (the previous `%(levels)` was not a valid format).
        verbose_formatter = logging.Formatter('%(levelname)s: %(message)s')
        root_logger = logging.getLogger()
        for handler in root_logger.handlers:
            handler.setFormatter(verbose_formatter)
        root_logger.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
    split_audio(**kwargs)

examples/transcribe.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/usr/bin/env python
2+
from __future__ import unicode_literals
3+
4+
from google.cloud import speech
5+
from google.cloud.speech import enums
6+
from google.cloud.speech import types
7+
import argparse
8+
import ffmpeg
9+
import logging
10+
import subprocess
11+
import sys
12+
13+
14+
logging.basicConfig(level=logging.INFO, format='%(message)s')
15+
logger = logging.getLogger(__file__)
16+
logger.setLevel(logging.INFO)
17+
18+
19+
parser = argparse.ArgumentParser(description='Convert speech audio to text using Google Speech API')
20+
parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
21+
22+
23+
def decode_audio(in_filename, **input_kwargs):
    """Decode ``in_filename`` to raw PCM audio by running ffmpeg.

    The ffmpeg output is written to stdout as ``s16le`` / ``pcm_s16le`` with
    ``ac=1`` and ``ar='16k'``, and captured in memory.

    Args:
        in_filename: Input filename handed to ``ffmpeg.input``.
        **input_kwargs: Extra keyword arguments forwarded to ``ffmpeg.input``.

    Returns:
        The bytes ffmpeg wrote to stdout.

    Exits the process with status 1, after echoing ffmpeg's stderr, if ffmpeg
    returns a non-zero exit code.
    """
    p = subprocess.Popen(
        (ffmpeg
            .input(in_filename, **input_kwargs)
            .output('-', format='s16le', acodec='pcm_s16le', ac=1, ar='16k')
            .overwrite_output()
            .compile()
        ),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    audio_data, err_data = p.communicate()
    if p.returncode != 0:
        # communicate() yields bytes for these pipes; sys.stderr is a text
        # stream, so writing the raw bytes raises TypeError on Python 3.
        # Decode first (as split_silence.py does), replacing undecodable bytes
        # so the error report itself cannot fail.
        sys.stderr.write(err_data.decode('utf-8', 'replace'))
        sys.exit(1)
    return audio_data
39+
40+
41+
def get_transcripts(audio_data):
    """Send ``audio_data`` to the Google Speech client and collect transcripts.

    The request is configured as LINEAR16 audio at 16000 Hz with language
    code ``en-US``.  Returns a list holding the transcript of the first
    alternative of every result in the response.
    """
    speech_client = speech.SpeechClient()
    recognition_audio = types.RecognitionAudio(content=audio_data)
    recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US'
    )
    response = speech_client.recognize(recognition_config, recognition_audio)

    transcripts = []
    for result in response.results:
        # Only the first alternative of each result is kept.
        first_alternative = result.alternatives[0]
        transcripts.append(first_alternative.transcript)
    return transcripts
51+
52+
53+
def transcribe(in_filename):
    """Decode ``in_filename`` with ffmpeg and print each transcript found.

    Every transcript is UTF-8 encoded and printed via ``repr``, one per line.
    """
    for text in get_transcripts(decode_audio(in_filename)):
        encoded_text = text.encode('utf-8')
        print(repr(encoded_text))
58+
59+
60+
if __name__ == '__main__':
    # Script entry point: transcribe the single positional input file.
    cli_args = parser.parse_args()
    in_filename = cli_args.in_filename
    transcribe(in_filename)

0 commit comments

Comments
 (0)
0