20
20
import com .google .api .gax .rpc .ClientStream ;
21
21
import com .google .api .gax .rpc .ResponseObserver ;
22
22
import com .google .api .gax .rpc .StreamController ;
23
- import com .google .cloud .speech .v1 .RecognitionConfig ;
24
- import com .google .cloud .speech .v1 .SpeechClient ;
25
- import com .google .cloud .speech .v1 .SpeechRecognitionAlternative ;
26
- import com .google .cloud .speech .v1 .StreamingRecognitionConfig ;
27
- import com .google .cloud .speech .v1 .StreamingRecognitionResult ;
28
- import com .google .cloud .speech .v1 .StreamingRecognizeRequest ;
29
- import com .google .cloud .speech .v1 .StreamingRecognizeResponse ;
23
+ import com .google .cloud .speech .v1p1beta1 .RecognitionConfig ;
24
+ import com .google .cloud .speech .v1p1beta1 .SpeechClient ;
25
+ import com .google .cloud .speech .v1p1beta1 .SpeechRecognitionAlternative ;
26
+ import com .google .cloud .speech .v1p1beta1 .StreamingRecognitionConfig ;
27
+ import com .google .cloud .speech .v1p1beta1 .StreamingRecognitionResult ;
28
+ import com .google .cloud .speech .v1p1beta1 .StreamingRecognizeRequest ;
29
+ import com .google .cloud .speech .v1p1beta1 .StreamingRecognizeResponse ;
30
30
import com .google .protobuf .ByteString ;
31
+ import com .google .protobuf .Duration ;
32
+ import java .lang .Math ;
33
+ import java .text .DecimalFormat ;
31
34
import java .util .ArrayList ;
32
35
import java .util .concurrent .BlockingQueue ;
33
36
import java .util .concurrent .LinkedBlockingQueue ;
39
42
40
43
public class InfiniteStreamRecognize {
41
44
45
+ private static final int STREAMING_LIMIT = 10000 ; // 10 seconds
46
+
47
+ public static final String RED = "\033 [0;31m" ;
48
+ public static final String GREEN = "\033 [0;32m" ;
49
+ public static final String YELLOW = "\033 [0;33m" ;
50
+
42
51
// Creating shared object
43
52
private static volatile BlockingQueue <byte []> sharedQueue = new LinkedBlockingQueue ();
44
53
private static TargetDataLine targetDataLine ;
45
54
private static int BYTES_PER_BUFFER = 6400 ; // buffer size in bytes
46
55
56
+ private static int restartCounter = 0 ;
57
+ private static ArrayList <ByteString > audioInput = new ArrayList <ByteString >();
58
+ private static ArrayList <ByteString > lastAudioInput = new ArrayList <ByteString >();
59
+ private static int resultEndTimeInMS = 0 ;
60
+ private static int isFinalEndTime = 0 ;
61
+ private static int finalRequestEndTime = 0 ;
62
+ private static boolean newStream = true ;
63
+ private static double bridgingOffset = 0 ;
64
+ private static boolean lastTranscriptWasFinal = false ;
65
+ private static StreamController referenceToStreamController ;
66
+ private static ByteString tempByteString ;
67
+
47
68
public static void main (String ... args ) {
48
69
try {
49
70
infiniteStreamingRecognize ();
@@ -60,6 +81,7 @@ class MicBuffer implements Runnable {
60
81
61
82
@ Override
62
83
public void run () {
84
+ System .out .println (YELLOW );
63
85
System .out .println ("Start speaking...Press Ctrl-C to stop" );
64
86
targetDataLine .start ();
65
87
byte [] data = new byte [BYTES_PER_BUFFER ];
@@ -88,24 +110,48 @@ public void run() {
88
110
89
111
ArrayList <StreamingRecognizeResponse > responses = new ArrayList <>();
90
112
91
- public void onStart (StreamController controller) {}
113
+ public void onStart (StreamController controller ) {
114
+ referenceToStreamController = controller ;
115
+ }
92
116
93
117
public void onResponse (StreamingRecognizeResponse response ) {
118
+
94
119
responses .add (response );
120
+
95
121
StreamingRecognitionResult result = response .getResultsList ().get (0 );
96
- // There can be several alternative transcripts for a given chunk of speech. Just
97
- // use the first (most likely) one here.
122
+
123
+ Duration resultEndTime = result .getResultEndTime ();
124
+
125
+ resultEndTimeInMS = (int ) ((resultEndTime .getSeconds () * 1000 )
126
+ + (resultEndTime .getNanos () / 1000000 ));
127
+
128
+ double correctedTime = resultEndTimeInMS - bridgingOffset
129
+ + (STREAMING_LIMIT * restartCounter );
130
+ DecimalFormat format = new DecimalFormat ("0.#" );
131
+
98
132
SpeechRecognitionAlternative alternative = result .getAlternativesList ().get (0 );
99
- System .out .printf ("Transcript : %s\n " , alternative .getTranscript ());
100
- }
133
+ if (result .getIsFinal ()) {
134
+ System .out .print (GREEN );
135
+ System .out .print ("\033 [2K\r " );
136
+ System .out .printf ("%s: %s\n " , format .format (correctedTime ),
137
+ alternative .getTranscript ());
101
138
102
- public void onComplete () {
103
- System .out .println ("Done" );
104
- }
139
+ isFinalEndTime = resultEndTimeInMS ;
140
+ lastTranscriptWasFinal = true ;
141
+ } else {
142
+ System .out .print (RED );
143
+ System .out .print ("\033 [2K\r " );
144
+ System .out .printf ("%s: %s" , format .format (correctedTime ),
145
+ alternative .getTranscript ());
105
146
106
- public void onError ( Throwable t ) {
107
- System . out . println ( t );
147
+ lastTranscriptWasFinal = false ;
148
+ }
108
149
}
150
+
151
+ public void onComplete () {}
152
+
153
+ public void onError (Throwable t ) {}
154
+
109
155
};
110
156
111
157
clientStream = client .streamingRecognizeCallable ().splitCall (responseObserver );
@@ -116,8 +162,12 @@ public void onError(Throwable t) {
116
162
.setLanguageCode ("en-US" )
117
163
.setSampleRateHertz (16000 )
118
164
.build ();
165
+
119
166
StreamingRecognitionConfig streamingRecognitionConfig =
120
- StreamingRecognitionConfig .newBuilder ().setConfig (recognitionConfig ).build ();
167
+ StreamingRecognitionConfig .newBuilder ()
168
+ .setConfig (recognitionConfig )
169
+ .setInterimResults (true )
170
+ .build ();
121
171
122
172
StreamingRecognizeRequest request =
123
173
StreamingRecognizeRequest .newBuilder ()
@@ -151,23 +201,84 @@ public void onError(Throwable t) {
151
201
152
202
long estimatedTime = System .currentTimeMillis () - startTime ;
153
203
154
- if (estimatedTime >= 55000 ) {
204
+ if (estimatedTime >= STREAMING_LIMIT ) {
155
205
156
206
clientStream .closeSend ();
207
+ referenceToStreamController .cancel (); // remove Observer
208
+
209
+ if (resultEndTimeInMS > 0 ) {
210
+ finalRequestEndTime = isFinalEndTime ;
211
+ }
212
+ resultEndTimeInMS = 0 ;
213
+
214
+ lastAudioInput = null ;
215
+ lastAudioInput = audioInput ;
216
+ audioInput = new ArrayList <ByteString >();
217
+
218
+ restartCounter ++;
219
+
220
+ if (!lastTranscriptWasFinal ) {
221
+ System .out .print ('\n' );
222
+ }
223
+
224
+ newStream = true ;
225
+
157
226
clientStream = client .streamingRecognizeCallable ().splitCall (responseObserver );
158
227
159
228
request =
160
229
StreamingRecognizeRequest .newBuilder ()
161
230
.setStreamingConfig (streamingRecognitionConfig )
162
231
.build ();
163
232
233
+ System .out .println (YELLOW );
234
+ System .out .printf ("%d: RESTARTING REQUEST\n " , restartCounter * STREAMING_LIMIT );
235
+
164
236
startTime = System .currentTimeMillis ();
165
237
166
238
} else {
239
+
240
+ if ((newStream ) && (lastAudioInput .size () > 0 )) {
241
+ // if this is the first audio from a new request
242
+ // calculate amount of unfinalized audio from last request
243
+ // resend the audio to the speech client before incoming audio
244
+ double chunkTime = STREAMING_LIMIT / lastAudioInput .size ();
245
+ // ms length of each chunk in previous request audio arrayList
246
+ if (chunkTime != 0 ) {
247
+ if (bridgingOffset < 0 ) {
248
+ // bridging Offset accounts for time of resent audio
249
+ // calculated from last request
250
+ bridgingOffset = 0 ;
251
+ }
252
+ if (bridgingOffset > finalRequestEndTime ) {
253
+ bridgingOffset = finalRequestEndTime ;
254
+ }
255
+ int chunksFromMS = (int ) Math .floor ((finalRequestEndTime
256
+ - bridgingOffset ) / chunkTime );
257
+ // chunks from MS is number of chunks to resend
258
+ bridgingOffset = (int ) Math .floor ((lastAudioInput .size ()
259
+ - chunksFromMS ) * chunkTime );
260
+ // set bridging offset for next request
261
+ for (int i = chunksFromMS ; i < lastAudioInput .size (); i ++) {
262
+
263
+ request =
264
+ StreamingRecognizeRequest .newBuilder ()
265
+ .setAudioContent (lastAudioInput .get (i ))
266
+ .build ();
267
+ clientStream .send (request );
268
+ }
269
+ }
270
+ newStream = false ;
271
+ }
272
+
273
+ tempByteString = ByteString .copyFrom (sharedQueue .take ());
274
+
167
275
request =
168
276
StreamingRecognizeRequest .newBuilder ()
169
- .setAudioContent (ByteString . copyFrom ( sharedQueue . take ()) )
277
+ .setAudioContent (tempByteString )
170
278
.build ();
279
+
280
+ audioInput .add (tempByteString );
281
+
171
282
}
172
283
173
284
clientStream .send (request );
0 commit comments