8000 updated algebird streaming demos, updated, nlp notebooks, cleaned up … · codeaudit/pipeline-1@5f3d022 · GitHub
[go: up one dir, main page]

Skip to content

Commit 5f3d022

Browse files
committed
updated algebird streaming demos, updated, nlp notebooks, cleaned up Dockerfile, reduced the spark-defaults in terms of cpu default
1 parent 835d110 commit 5f3d022

File tree

19 files changed

+1192
-440
lines changed

19 files changed

+1192
-440
lines changed

Dockerfile

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,8 +232,6 @@ RUN \
232232
# This is temporary while we figure out how to specify the following dependency as a --package into Spark (note `models` classifier)
233233
# edu.stanford.corenlp:stanford-corenlp:${STANFORD_CORENLP_VERSION}:models
234234
# Classifiers don't appear to be supported by --packages
235-
# && cd ~/pipeline/myapps/ml/lib \
236-
# && wget http://search.maven.org/remotecontent?filepath=edu/stanford/nlp/stanford-corenlp/${STANFORD_CORENLP_VERSION}/stanford-corenlp-${STANFORD_CORENLP_VERSION}-models.jar \
237235
&& cd ~ \
238236
&& wget http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip \
239237
&& unzip stanford-corenlp-full-2015-12-09.zip \

config/.vim/.netrwhist

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
let g:netrw_dirhistmax =10
2-
let g:netrw_dirhist_cnt =7
2+
let g:netrw_dirhist_cnt =9
33
let g:netrw_dirhist_1='/root/pipeline/config/logstash'
44
let g:netrw_dirhist_2='/root/pipeline/flows/flowfile_repository'
55
let g:netrw_dirhist_3='/root/pipeline/myapps/ml/src/main/scala/com/advancedspark/ml/graph'
66
let g:netrw_dirhist_4='/root/pipeline/config/zeppelin'
77
let g:netrw_dirhist_5='/root/pipeline/myapps/ml/src/main/scala/com/advancedspark/ml/graph'
88
let g:netrw_dirhist_6='/root/spark-1.6.0-bin-fluxcapacitor/tachyon'
99
let g:netrw_dirhist_7='/root/spark-1.6.0-bin-fluxcapacitor/tachyon/bin'
10+
let g:netrw_dirhist_8='/root/pipeline/myapps/streaming'
11+
let g:netrw_dirhist_9='/root/pipeline/myapps/streaming/src/main/scala/com/advancedspark/streaming/rating/approx'

config/bash/.profile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ export KEYSTONEML_VERSION=0.2
4040
export SPARK_HASH_VERSION=0.1.3
4141
export NIFI_VERSION=0.4.1
4242
export SPARK_NIFI_CONNECTOR_VERSION=0.4.1
43-
export ELSEVIER_XML_UTILS_VERSION=1.3.0
43+
export SPARK_XML_VERSION=0.3.1
4444

4545
#Dev Install
4646
export DEV_INSTALL_HOME=~
@@ -159,8 +159,9 @@ export SBT_OPTS="-Xmx10G -XX:+CMSClassUnloadingEnabled"
159159
export MYAPPS_HOME=$PIPELINE_HOME/myapps
160160

161161
# --packages used to pass into our Spark jobs
162-
export SPARK_SUBMIT_PACKAGES=org.apache.spark:spark-streaming-kafka-assembly_2.10:$SPARK_VERSION,org.elasticsearch:elasticsearch-spark_2.10:$SPARK_ELASTICSEARCH_CONNECTOR_VERSION,com.datastax.spark:spark-cassandra-connector_2.10:$SPARK_CASSANDRA_CONNECTOR_VERSION,redis.clients:jedis:$JEDIS_VERSION,com.twitter:algebird-core_2.10:$ALGEBIRD_VERSION,com.databricks:spark-avro_2.10:$SPARK_AVRO_CONNECTOR_VERSION,com.databricks:spark-csv_2.10:$SPARK_CSV_CONNECTOR_VERSION,org.apache.nifi:nifi-spark-receiver:$SPARK_NIFI_CONNECTOR_VERSION,brkyvz:streaming-matrix-factorization:$STREAMING_MATRIX_FACTORIZATION_VERSION,com.madhukaraphatak:java-sizeof_2.10:0.1,elsevierlabs-os:spark-xml-utils:$ELSEVIER_XML_UTILS_VERSION,edu.stanford.nlp:stanford-corenlp:$STANFORD_CORENLP_VERSION
162+
export SPARK_SUBMIT_PACKAGES=org.apache.spark:spark-streaming-kafka-assembly_2.10:$SPARK_VERSION,org.elasticsearch:elasticsearch-spark_2.10:$SPARK_ELASTICSEARCH_CONNECTOR_VERSION,com.datastax.spark:spark-cassandra-connector_2.10:$SPARK_CASSANDRA_CONNECTOR_VERSION,redis.clients:jedis:$JEDIS_VERSION,com.twitter:algebird-core_2.10:$ALGEBIRD_VERSION,com.databricks:spark-avro_2.10:$SPARK_AVRO_CONNECTOR_VERSION,com.databricks:spark-csv_2.10:$SPARK_CSV_CONNECTOR_VERSION,org.apache.nifi:nifi-spark-receiver:$SPARK_NIFI_CONNECTOR_VERSION,brkyvz:streaming-matrix-factorization:$STREAMING_MATRIX_FACTORIZATION_VERSION,com.madhukaraphatak:java-sizeof_2.10:0.1,com.databricks:spark-xml_2.10:$SPARK_XML_VERSION,edu.stanford.nlp:stanford-corenlp:$STANFORD_CORENLP_VERSION
163163

164+
# We still need to include a reference to a local stanford-corenlp-$STANFORD_CORENLP_VERSION-models.jar because SparkSubmit doens't support a classifier in --packages
164165
export SPARK_SUBMIT_JARS=$MYSQL_CONNECTOR_JAR,$MYAPPS_HOME/ml/lib/spark-corenlp_2.10-0.1.jar,$MYAPPS_HOME/ml/lib/stanford-corenlp-$STANFORD_CORENLP_VERSION-models.jar,$MYAPPS_HOME/ml/target/scala-2.10/ml_2.10-1.0.jar,$MYAPPS_HOME/sql/target/scala-2.10/sql_2.10-1.0.jar,$MYAPPS_HOME/core/target/scala-2.10/core_2.10-1.0.jar,$MYAPPS_HOME/streaming/target/scala-2.10/streaming_2.10-1.0.jar
165166

166167
# Zeppelin

config/spark/spark-defaults.conf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,13 @@ spark.master=spark://127.0.0.1:7077
1313
#spark.executor.cores=10
1414
#spark.executor.memory=10240m
1515
#spark.executor.instances=4
16-
spark.default.parallelism=16
16+
spark.default.parallelism=4
1717
spark.shuffle.service.enabled=false
1818
spark.dynamicAllocation.enabled=false
1919
spark.dynamicAllocation.minExecutors=1
2020
spark.dynamicAllocation.maxExecutors=4
2121
#spark.streaming.backpressure.enabled=true
22-
spark.deploy.defaultCores=4
22+
spark.deploy.defaultCores=1
2323
spark.eventLog.enabled=true
2424
spark.eventLog.dir=/root/pipeline/logs/spark
2525
spark.history.fs.logDirectory=/root/pipeline/logs/spark/spark-events

config/zeppelin/interpreter.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,16 @@
525525
"2AS9P7JSA",
526526
"2ARR8UZDJ",
527527
"2AR33ZMZJ"
528+
],
529+
"2BC4DHMA8": [
530+
"2AS9P7JSA",
531+
"2ARR8UZDJ",
532+
"2AR33ZMZJ"
533+
],
534+
"2BAXQPPAV": [
535+
"2AS9P7JSA",
536+
"2ARR8UZDJ",
537+
"2AR33ZMZJ"
528538
]
529539
}
530540
}

data_persist/zeppelin/2AUUDPT56/note.json

Lines changed: 47 additions & 47 deletions
Large diffs are not rendered by default.

data_persist/zeppelin/2B42HQF6Z/note.json

Lines changed: 55 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"paragraphs": [
33
{
44
"text": "%md \n### Streaming Approximation Code Examples\n[Redis HyperLogLog (HLL)](https://github.com/fluxcapacitor/pipeline/blob/master/myapps/streaming/src/main/scala/com/advancedspark/streaming/rating/approx/RedisHyperLogLog.scala)\n\n[Algebird HyperLogLog (HLL)](https://github.com/fluxcapacitor/pipeline/blob/master/myapps/streaming/src/main/scala/com/advancedspark/streaming/rating/approx/AlgebirdHyperLogLog.scala)\n\n[Algebird CountMin Sketch TopK (CMS)](https://github.com/fluxcapacitor/pipeline/blob/master/myapps/streaming/src/main/scala/com/advancedspark/streaming/rating/approx/AlgebirdCountMinSketchTopK.scala)",
5-
"dateUpdated": "Jan 16, 2016 3:49:35 PM",
5+
"dateUpdated": "Jan 29, 2016 4:24:20 AM",
66
"config": {
77
"colWidth": 12.0,
88
"graph": {
@@ -16,7 +16,8 @@
1616
},
1717
"enabled": true,
1818
"editorMode": "ace/mode/markdown",
19-
"editorHide": true
19+
"editorHide": true,
20+
"tableHide": false
2021
},
2122
"settings": {
2223
"params": {},
@@ -30,15 +31,15 @@
3031
"msg": "\u003ch3\u003eStreaming Approximation Code Examples\u003c/h3\u003e\n\u003cp\u003e\u003ca href\u003d\"https://github.com/fluxcapacitor/pipeline/blob/master/myapps/streaming/src/main/scala/com/advancedspark/streaming/rating/approx/RedisHyperLogLog.scala\"\u003eRedis HyperLogLog (HLL)\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003ca href\u003d\"https://github.com/fluxcapacitor/pipeline/blob/master/myapps/streaming/src/main/scala/com/advancedspark/streaming/rating/approx/AlgebirdHyperLogLog.scala\"\u003eAlgebird HyperLogLog (HLL)\u003c/a\u003e\u003c/p\u003e\n\u003cp\u003e\u003ca href\u003d\"https://github.com/fluxcapacitor/pipeline/blob/master/myapps/streaming/src/main/scala/com/advancedspark/streaming/rating/approx/AlgebirdCountMinSketchTopK.scala\"\u003eAlgebird CountMin Sketch TopK (CMS)\u003c/a\u003e\u003c/p\u003e\n"
3132
},
3233
"dateCreated": "Nov 23, 2015 4:48:23 PM",
33-
"dateStarted": "Jan 16, 2016 3:49:27 PM",
34-
"dateFinished": "Jan 16, 2016 3:49:27 PM",
34+
"dateStarted": "Jan 29, 2016 4:01:44 AM",
35+
"dateFinished": "Jan 29, 2016 4:01:44 AM",
3536
"status": "FINISHED",
3637
"progressUpdateIntervalMs": 500
3738
},
3839
{
39-
"title": "Retrieve from Redis",
40-
"text": "import redis.clients.jedis.Jedis\n\nval itemId \u003d 7 // Spark\n\nval jedis \u003d new Jedis(\"127.0.0.1\", 6379)\n\nval exactNumRatingsForItemId \u003d jedis.get(s\"\"\"exact-rating-count:${itemId}\"\"\")\nval exactNumDistinctUsersWhoRatedItemId \u003d jedis.scard(s\"\"\"exact-distinct-user-rating-count:${itemId}\"\"\")\nval approxNumDistinctUsersWhoRatedItemId \u003d jedis.pfcount(s\"\"\"approx-distinct-user-rating-count:${itemId}\"\"\")",
41-
"dateUpdated": "Jan 19, 2016 9:08:35 PM",
40+
"title": "Approximate HyperLogLog (Redis)",
41+
"text": "import redis.clients.jedis.Jedis\n\nval itemId \u003d 1 // Spark\n\nval jedis \u003d new Jedis(\"127.0.0.1\", 6379)\n\nval approxNumDistinctUsersWhoRatedItemId \u003d jedis.pfcount(s\"\"\"approx-distinct-user-rating-count:${itemId}\"\"\")",
42+
"dateUpdated": "Jan 29, 2016 4:24:20 AM",
4243
"config": {
4344
"colWidth": 12.0,
4445
"graph": {
@@ -52,7 +53,8 @@
5253
},
5354
"editorMode": "ace/mode/scala",
5455
"enabled": true,
55-
"title": true
56+
"title": true,
57+
"tableHide": false
5658
},
5759
"settings": {
5860
"params": {},
@@ -61,18 +63,20 @@
6163
"jobName": "paragraph_1448297306376_418047956",
6264
"id": "20151123-164826_892246958",
6365
"result": {
64-
"code": "ERROR",
66+
"code": "SUCCESS",
6567
"type": "TEXT",
66-
"msg": "File name too long"
68+
"msg": "import redis.clients.jedis.Jedis\nitemId: Int \u003d 1\njedis: redis.clients.jedis.Jedis \u003d redis.clients.jedis.Jedis@29db7d87\napproxNumDistinctUsersWhoRatedItemId: Long \u003d 6\n"
6769
},
6870
"dateCreated": "Nov 23, 2015 4:48:26 PM",
69-
"dateStarted": "Jan 16, 2016 3:49:39 PM",
70-
"dateFinished": "Jan 16, 2016 3:49:40 PM",
71-
"status": "ERROR",
71+
"dateStarted": "Jan 29, 2016 4:23:12 AM",
72+
"dateFinished": "Jan 29, 2016 4:23:13 AM",
73+
"status": "FINISHED",
7274
"progressUpdateIntervalMs": 500
7375
},
7476
{
75-
"dateUpdated": "Jan 16, 2016 5:31:31 AM",
77+
"title": "Exact Count (Redis)",
78+
"text": "val exactNumRatingsForItemId \u003d jedis.get(s\"\"\"exact-rating-count:${itemId}\"\"\")\nval exactNumDistinctUsersWhoRatedItemId \u003d jedis.scard(s\"\"\"exact-distinct-user-rating-count:${itemId}\"\"\")",
79+
"dateUpdated": "Jan 29, 2016 4:24:20 AM",
7680
"config": {
7781
"colWidth": 12.0,
7882
"graph": {
@@ -85,7 +89,10 @@
8589
"scatter": {}
8690
},
8791
"enabled": true,
88-
"editorMode": "ace/mode/scala"
92+
"editorMode": "ace/mode/scala",
93+
"editorHide": false,
94+
"title": true,
95+
"tableHide": false
8996
},
9097
"settings": {
9198
"params": {},
@@ -95,16 +102,45 @@
95102
"id": "20160115-050946_110716227",
96103
"result": {
97104
"code": "SUCCESS",
98-
"type": "TEXT"
105+
"type": "TEXT",
106+
"msg": "exactNumRatingsForItemId: String \u003d 7\nexactNumDistinctUsersWhoRatedItemId: Long \u003d 5\n"
99107
},
100108
"dateCreated": "Jan 15, 2016 5:09:46 AM",
101-
"dateStarted": "Jan 16, 2016 5:31:32 AM",
102-
"dateFinished": "Jan 16, 2016 5:31:32 AM",
109+
"dateStarted": "Jan 29, 2016 4:23:15 AM",
110+
"dateFinished": "Jan 29, 2016 4:23:16 AM",
103111
"status": "FINISHED",
104112
"progressUpdateIntervalMs": 500
113+
},
114+
{
115+
"dateUpdated": "Jan 29, 2016 4:24:20 AM",
116+
"config": {
117+
"colWidth": 12.0,
118+
"graph": {
119+
"mode": "table",
120+
"height": 300.0,
121+
"optionOpen": false,
122+
"keys": [],
123+
"values": [],
124+
"groups": [],
125+
"scatter": {}
126+
},
127+
"enabled": true,
128+
"editorMode": "ace/mode/scala",
129+
"tableHide": false
130+
},
131+
"settings": {
132+
"params": {},
133+
"forms": {}
134+
},
135+
"jobName": "paragraph_1454040082116_986786514",
136+
"id": "20160129-040122_1823942882",
137+
"dateCreated": "Jan 29, 2016 4:01:22 AM",
138+
"status": "READY",
139+
"errorMessage": "",
140+
"progressUpdateIntervalMs": 500
105141
}
106142
],
107-
"name": "Live Recs/02: Approximate Counts (Redis + Algebird)",
143+
"name": "Live Recs/02: Approximate HyperLogLog Count (Redis)",
108144
"id": "2B42HQF6Z",
109145
"angularObjects": {
110146
"2ARR8UZDJ": [],

data_persist/zeppelin/2B7RXDS6A/note.json

Lines changed: 44 additions & 44 deletions
Large diffs are not rendered by default.

0 commit comments

Comments (0)