8000 Add examples: `split_silence` + `transcribe` by kkroening · Pull Request #52 · kkroening/ffmpeg-python · GitHub
[go: up one dir, main page]

Skip to content

Add examples: split_silence + transcribe #52

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 14, 2018
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ffmpeg
141 changes: 141 additions & 0 deletions examples/split_silence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/env python
from __future__ import unicode_literals

import argparse
import errno
import ffmpeg
import logging
import os
import re
import subprocess
import sys


logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__file__)
logger.setLevel(logging.INFO)

DEFAULT_DURATION = 0.3
DEFAULT_THRESHOLD = -60

parser = argparse.ArgumentParser(description='Split media into separate chunks wherever silence occurs')
parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
parser.add_argument('out_pattern', help='Output filename pattern (e.g. `out/chunk_{:04d}.wav`)')
parser.add_argument('--silence-threshold', default=DEFAULT_THRESHOLD, type=int, help='Silence threshold (in dB)')
parser.add_argument('--silence-duration', default=DEFAULT_DURATION, type=float, help='Silence duration')
parser.add_argument('--start-time', type=float, help='Start time (seconds)')
parser.add_argument('--end-time', type=float, help='End time (seconds)')
parser.add_argument('-v', dest='verbose', action='store_true', help='Verbose mode')

silence_start_re = re.compile(' silence_start: (?P<start>[0-9]+(\.?[0-9]*))$')
silence_end_re = re.compile(' silence_end: (?P<end>[0-9]+(\.?[0-9]*)) ')
total_duration_re = re.compile(
'size=[^ ]+ time=(?P<hours>[0-9]{2}):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9\.]{5}) bitrate=')


def _logged_popen(cmd_line, *args, **kwargs):
logger.debug('Running command: {}'.format(subprocess.list2cmdline(cmd_line)))
return subprocess.Popen(cmd_line, *args, **kwargs)


def get_chunk_times(in_filename, silence_threshold, silence_duration, start_time=None, end_time=None):
input_kwargs = {}
if start_time is not None:
input_kwargs['ss'] = start_time
else:
start_time = 0.
if end_time is not None:
input_kwargs['t'] = end_time - start_time

p = _logged_popen(
(ffmpeg
.input(in_filename, **input_kwargs)
.filter_('silencedetect', n='{}dB'.format(silence_threshold), d=silence_duration)
.output('-', format='null')
.compile()
) + ['-nostats'], # FIXME: use .nostats() once it's implemented in ffmpeg-python.
stderr=subprocess.PIPE
)
output = p.communicate()[1].decode('utf-8')
if p.returncode != 0:
sys.stderr.write(output)
sys.exit(1)
logger.debug(output)
lines = output.splitlines()

# Chunks start when silence ends, and chunks end when silence starts.
chunk_starts = []
chunk_ends = []
for line in lines:
silence_start_match = silence_start_re.search(line)
silence_end_match = silence_end_re.search(line)
total_duration_match = total_duration_re.search(line)
if silence_start_match:
chunk_ends.append(float(silence_start_match.group('start')))
if len(chunk_starts) == 0:
# Started with non-silence.
chunk_starts.append(start_time or 0.)
elif silence_end_match:
chunk_starts.append(float(silence_end_match.group('end')))
elif total_duration_match:
hours = int(total_duration_match.group('hours'))
minutes = int(total_duration_match.group('minutes'))
seconds = float(total_duration_match.group('seconds'))
end_time = hours * 3600 + minutes * 60 + seconds

if len(chunk_starts) == 0:
# No silence found.
chunk_starts.append(start_time)

if len(chunk_starts) > len(chunk_ends):
# Finished with non-silence.
chunk_ends.append(end_time or 10000000.)

return list(zip(chunk_starts, chunk_ends))


def _makedirs(path):
"""Python2-compatible version of ``os.makedirs(path, exist_ok=True)``."""
try:
os.makedirs(path)
except OSError as exc:
if exc.errno != errno.EEXIST or not os.path.isdir(path):
raise


def split_audio(
in_filename,
out_pattern,
silence_threshold=DEFAULT_THRESHOLD,
silence_duration=DEFAULT_DURATION,
start_time=None,
end_time=None,
verbose=False,
):
chunk_times = get_chunk_times(in_filename, silence_threshold, silence_duration, start_time, end_time)

for i, (start_time, end_time) in enumerate(chunk_times):
time = end_time - start_time
out_filename = out_pattern.format(i, i=i)
_makedirs(os.path.dirname(out_filename))

logger.info('{}: start={:.02f}, end={:.02f}, duration={:.02f}'.format(out_filename, start_time, end_time,
time))
_logged_popen(
(ffmpeg
.input(in_filename, ss=start_time, t=time)
.output(out_filename)
.overwrite_output()
.compile()
),
stdout=subprocess.PIPE if not verbose else None,
stderr=subprocess.PIPE if not verbose else None,
).communicate()


if __name__ == '__main__':
kwargs = vars(parser.parse_args())
if kwargs['verbose']:
logging.basicConfig(level=logging.DEBUG, format='%(levels): %(message)s')
logger.setLevel(logging.DEBUG)
split_audio(**kwargs)
62 changes: 62 additions & 0 deletions examples/transcribe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python
from __future__ import unicode_literals

from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
import argparse
import ffmpeg
import logging
import subprocess
import sys


logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__file__)
logger.setLevel(logging.INFO)


parser = argparse.ArgumentParser(description='Convert speech audio to text using Google Speech API')
parser.add_argument('in_filename', help='Input filename (`-` for stdin)')


def decode_audio(in_filename, **input_kwargs):
p = subprocess.Popen(
(ffmpeg
.input(in_filename, **input_kwargs)
.output('-', format='s16le', acodec='pcm_s16le', ac=1, ar='16k')
.overwrite_output()
.compile()
),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
out = p.communicate()
if p.returncode != 0:
sys.stderr.write(out[1])
sys.exit(1)
return out[0]


def get_transcripts(audio_data):
client = speech.SpeechClient()
audio = types.RecognitionAudio(content=audio_data)
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
language_code='en-US'
)
response = client.recognize(config, audio)
return [result.alternatives[0].transcript for result in response.results]


def transcribe(in_filename):
audio_data = decode_audio(in_filename)
transcripts = get_transcripts(audio_data)
for transcript in transcripts:
print(repr(transcript.encode('utf-8')))


if __name__ == '__main__':
args = parser.parse_args()
transcribe(args.in_filename)
18 changes: 12 additions & 6 deletions ffmpeg/_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,22 +131,28 @@ def get_args(stream_spec, overwrite_output=False):
return args


@output_operator()
def compile(stream_spec, cmd='ffmpeg', **kwargs):
"""Build command-line for ffmpeg."""
if isinstance(cmd, basestring):
cmd = [cmd]
elif type(cmd) != list:
cmd = list(cmd)
return cmd + get_args(stream_spec, **kwargs)


@output_operator()
def run(stream_spec, cmd='ffmpeg', **kwargs):
"""Run ffmpeg on node graph.

Args:
**kwargs: keyword-arguments passed to ``get_args()`` (e.g. ``overwrite_output=True``).
"""
if isinstance(cmd, basestring):
cmd = [cmd]
elif type(cmd) != list:
cmd = list(cmd)
args = cmd + get_args(stream_spec, **kwargs)
_subprocess.check_call(args)
_subprocess.check_call(compile(stream_spec, cmd, **kwargs))


__all__ = [
'compile',
'get_args',
'run',
]
6 changes: 6 additions & 0 deletions ffmpeg/tests/test_ffmpeg.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,12 @@ def _get_drawtext_text_repr(text):
# subprocess.check_call(['ffmpeg', '-version'])


def test_compile():
out_file = ffmpeg.input('dummy.mp4').output('dummy2.mp4')
assert out_file.compile() == ['ffmpeg', '-i', 'dummy.mp4', 'dummy2.mp4']
assert out_file.compile(cmd='ffmpeg.old') == ['ffmpeg.old', '-i', 'dummy.mp4', 'dummy2.mp4']


def test_run():
stream = _get_complex_filter_example()
ffmpeg.run(stream)
Expand Down
0