hanv89/python-docs-samples

Commit b0aba17
Add sample
1 parent 661f47b commit b0aba17

File tree

3 files changed: +62 -4 lines changed
pubsub/streaming-analytics/PubSubToGCS.py

Lines changed: 54 additions & 0 deletions
import argparse
import logging

import apache_beam as beam
import apache_beam.transforms.window as window
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions


def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        help=('The Cloud Pub/Sub topic to read from: '
              '"projects/<PROJECT_NAME>/topics/<TOPIC_NAME>".'))
    parser.add_argument(
        '--window_size',
        type=int,
        default=1,
        help='Output file\'s window size in number of minutes.')
    parser.add_argument(
        '--output_path',
        help='GCS path of the output file, including the filename prefix.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # One or more DoFns rely on global context, so save the main session
    # for the workers.
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # Read from Cloud Pub/Sub into a PCollection of raw message bytes.
    if known_args.input_topic:
        messages = (p
                    | 'Read Pub/Sub Messages' >> beam.io.ReadFromPubSub(
                        topic=known_args.input_topic)
                    .with_output_types(bytes))

    # Group messages into fixed-size windows. FixedWindows is sized in
    # seconds, so convert the minute-based flag value.
    transformed = (messages
                   | beam.WindowInto(
                       window.FixedWindows(known_args.window_size * 60))
                   .with_output_types(bytes))

    # Write each window's messages to GCS under the given path prefix.
    transformed | beam.io.WriteToText(file_path_prefix=known_args.output_path)

    result = p.run()
    result.wait_until_finish()


if __name__ == '__main__':  # noqa
    logging.getLogger().setLevel(logging.INFO)
    run()
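For a quick end-to-end check it helps to have a few messages on the input topic before starting the pipeline. The snippet below is a minimal sketch using the google-cloud-pubsub client library, which is not part of this commit's requirements; the project and topic IDs are placeholders.

```python
# Sketch (assumption): publish a handful of test messages so the streaming
# pipeline above has something to window and write. Requires
# `pip install google-cloud-pubsub`; project and topic IDs are placeholders.
from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
topic_path = publisher.topic_path('my-project', 'my-topic')

for i in range(10):
    data = 'message {}'.format(i).encode('utf-8')  # Pub/Sub payloads are bytes
    publisher.publish(topic_path, data).result()   # block until accepted
```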

pubsub/streaming-analytics/README.md

Lines changed: 7 additions & 4 deletions
@@ -106,14 +106,17 @@ The following example will run a streaming pipeline. It will read messages from
 + `--output`: sets the output GCS path prefix to write files to
 + `--runner [optional]`: specifies the runner to run the pipeline, defaults to `DirectRunner`
 + `--windowSize [optional]`: specifies the window size in minutes, defaults to 1
++ `--temp_location`: needed for execution of the pipeline

 ```bash
-python PubSubToGCS.py \
+python -m PubSubToGCS \
 --project=$PROJECT_NAME \
---inputTopic=projects/$PROJECT_NAME/topics/cron-topic \
---output=gs://$BUCKET_NAME/samples/output \
+--input_topic=projects/$PROJECT_NAME/topics/june \
+--output_path=gs://$BUCKET_NAME/labor \
 --runner=DataflowRunner \
---windowSize=2
+--window_size=2 \
+--temp_location=gs://$BUCKET_NAME/temp \
+--experiments=allow_non_updateable_job
 ```

 After the job has been submitted, you can check its status in the [GCP Console Dataflow page].
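The command mixes the sample's own flags (`--input_topic`, `--output_path`, `--window_size`) with flags that `parse_known_args` leaves for Beam's `PipelineOptions` (`--project`, `--runner`, `--temp_location`, `--experiments`), which is why `--temp_location` matters even though the script never declares it. A small sketch of that split, with placeholder values:

```python
# Sketch: how parse_known_args splits the README command's flags between the
# sample's arguments and Beam's PipelineOptions. Values are placeholders.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input_topic')
parser.add_argument('--window_size', type=int, default=1)
parser.add_argument('--output_path')

argv = [
    '--project=my-project',                               # -> PipelineOptions
    '--input_topic=projects/my-project/topics/my-topic',  # -> known_args
    '--output_path=gs://my-bucket/samples/output',        # -> known_args
    '--runner=DataflowRunner',                            # -> PipelineOptions
    '--window_size=2',                                    # -> known_args
    '--temp_location=gs://my-bucket/temp',                # -> PipelineOptions
]
known_args, pipeline_args = parser.parse_known_args(argv)
print(known_args)     # input_topic, output_path, window_size=2
print(pipeline_args)  # ['--project=...', '--runner=...', '--temp_location=...']
```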
pubsub/streaming-analytics/requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -1 +1,2 @@
 google-api-python-client==1.7.9
+apache-beam==2.15.0
