get-started: Use dvclive · iterative/example-repos-dev@95ce158 · GitHub

Commit 95ce158

daavoo authored and shcheklein committed
get-started: Use dvclive
1 parent bec1b66 commit 95ce158

File tree

11 files changed: +133 -52 lines changed

example-get-started/analyze.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+import io
+import os
+import random
+import re
+import sys
+import xml.etree.ElementTree
+
+
+if len(sys.argv) != 3:
+    sys.stderr.write("Arguments error. Usage:\n")
+    sys.stderr.write("\tpython analyze.py data-file output-file\n")
+    sys.exit(1)
+
+target = 40000
+split = 0.3
+
+
+def lines_matched_test(fd, test):
+    for line in fd:
+        try:
+            attr = xml.etree.ElementTree.fromstring(line).attrib
+            if test(attr.get("Tags", "")):
+                yield line
+        except Exception as ex:
+            sys.stderr.write(f"Skipping the broken line: {ex}\n")
+
+
+def process_posts(fd_in, fd_not, fd_out):
+    count = 0
+    in_lines = lines_matched_test(fd_in, lambda x: "<r>" in x)
+    not_lines = lines_matched_test(fd_not, lambda x: "<r>" not in x)
+    while count < target:
+        line = next(not_lines) if random.random() > split else next(in_lines)
+        fd_out.write(line)
+        count += 1
+
+
+with io.open(sys.argv[1], encoding="utf8") as fd_in:
+    with io.open(sys.argv[1], encoding="utf8") as fd_not:
+        with io.open(sys.argv[2], "w", encoding="utf8") as fd_out:
+            process_posts(fd_in, fd_not, fd_out)
+

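Note: analyze.py builds a mixed sample by parsing each row of the StackOverflow dump and testing its `Tags` attribute, exactly as `lines_matched_test` does above. A minimal sketch of that parsing step, with an invented row for illustration:

```python
import xml.etree.ElementTree

# Invented example row; real StackOverflow dump rows carry many more attributes.
row = '<row Id="123" Tags="&lt;r&gt;&lt;dataframe&gt;" />'

attr = xml.etree.ElementTree.fromstring(row).attrib
tags = attr.get("Tags", "")  # XML entities decode to "<r><dataframe>"
print("<r>" in tags)         # True -> lines_matched_test would yield this line
```
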
example-get-started/code/.github/workflows/cml.yaml

Lines changed: 6 additions & 1 deletion
@@ -23,10 +23,15 @@ jobs:
           echo "# CML Report" > report.md
           echo "## Plots" >> report.md
           dvc plots diff $PREVIOUS_REF workspace \
-            --show-vega --targets prc.json > vega.json
+            --show-vega --targets evaluation/plots/precision_recall.json > vega.json
           vl2svg vega.json prc.svg
           cml publish prc.svg --title "Precision & Recall" --md >> report.md
 
+          dvc plots show \
+            --show-vega evaluation/plots/predictions.json > vega.json
+          vl2svg vega.json confusion.svg
+          cml publish confusion.svg --title "Confusion Matrix" --md >> report.md
+
           echo "## Metrics" >> report.md
           echo "### $PREVIOUS_REF → workspace" >> report.md
           dvc metrics diff $PREVIOUS_REF --show-md >> report.md

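Note: the new workflow step renders `evaluation/plots/predictions.json`, written by `src/evaluate.py`, as a confusion matrix. A hedged sketch of inspecting that file locally (assumes the evaluation stage has already run; `confusion_matrix` is standard scikit-learn):

```python
import json

from sklearn.metrics import confusion_matrix

# Each record is {"actual": 0 or 1, "predicted": 0 or 1}, as written by evaluate.py.
with open("evaluation/plots/predictions.json") as f:
    records = json.load(f)

y_true = [r["actual"] for r in records]
y_pred = [r["predicted"] for r in records]
print(confusion_matrix(y_true, y_pred))  # 2x2 matrix: rows = actual, cols = predicted
```
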
example-get-started/code/README.md

Lines changed: 3 additions & 2 deletions
@@ -8,8 +8,8 @@ introduction into basic DVC concepts.
 
 The project is a natural language processing (NLP) binary classifier problem of
 predicting tags for a given StackOverflow question. For example, we want one
-classifier which can predict a post that is about the Python language by tagging
-it `python`.
+classifier which can predict a post that is about the R language by tagging it
+`R`.
 
 🐛 Please report any issues found in this project here -
 [example-repos-dev](https://github.com/iterative/example-repos-dev).
@@ -160,3 +160,4 @@ $ tree
 ├── requirements.txt # <-- Python dependencies needed in the project
 └── train.py
 ```
+

example-get-started/code/params.yaml

Lines changed: 2 additions & 1 deletion
@@ -3,10 +3,11 @@ prepare:
   seed: 20170428
 
 featurize:
-  max_features: 500
+  max_features: 100
   ngrams: 1
 
 train:
   seed: 20170428
   n_est: 50
   min_split: 2
+

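Note: `max_features` drops from 500 to 100. The pipeline stages read these values from params.yaml; a minimal sketch of that lookup (illustrative, not copied from the repo's scripts):

```python
import yaml

with open("params.yaml") as f:
    params = yaml.safe_load(f)

max_features = params["featurize"]["max_features"]  # 100 after this change
ngrams = params["featurize"]["ngrams"]              # 1
```
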
example-get-started/code/src/evaluate.py

Lines changed: 36 additions & 26 deletions
@@ -4,45 +4,48 @@
 import pickle
 import sys
 
-import sklearn.metrics as metrics
+import pandas as pd
+from sklearn import metrics
+from sklearn import tree
+from dvclive import Live
+from matplotlib import pyplot as plt
 
-if len(sys.argv) != 6:
+
+live = Live("evaluation")
+
+if len(sys.argv) != 3:
     sys.stderr.write("Arguments error. Usage:\n")
-    sys.stderr.write("\tpython evaluate.py model features scores prc roc\n")
+    sys.stderr.write("\tpython evaluate.py model features\n")
     sys.exit(1)
 
 model_file = sys.argv[1]
 matrix_file = os.path.join(sys.argv[2], "test.pkl")
-scores_file = sys.argv[3]
-prc_file = sys.argv[4]
-roc_file = sys.argv[5]
 
 with open(model_file, "rb") as fd:
     model = pickle.load(fd)
 
 with open(matrix_file, "rb") as fd:
-    matrix = pickle.load(fd)
+    matrix, feature_names = pickle.load(fd)
 
 labels = matrix[:, 1].toarray()
 x = matrix[:, 2:]
 
 predictions_by_class = model.predict_proba(x)
 predictions = predictions_by_class[:, 1]
 
-precision, recall, prc_thresholds = metrics.precision_recall_curve(labels, predictions)
-fpr, tpr, roc_thresholds = metrics.roc_curve(labels, predictions)
-
-avg_prec = metrics.average_precision_score(labels, predictions)
-roc_auc = metrics.roc_auc_score(labels, predictions)
-
-with open(scores_file, "w") as fd:
-    json.dump({"avg_prec": avg_prec, "roc_auc": roc_auc}, fd, indent=4)
+# Use dvclive to log a few simple plots ...
+live.log_plot("roc", labels, predictions)
+live.log("avg_prec", metrics.average_precision_score(labels, predictions))
+live.log("roc_auc", metrics.roc_auc_score(labels, predictions))
 
+# ... but actually it can be done with dumping data points into a file:
 # ROC has a drop_intermediate arg that reduces the number of points.
 # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve.
 # PRC lacks this arg, so we manually reduce to 1000 points as a rough estimate.
+precision, recall, prc_thresholds = metrics.precision_recall_curve(labels, predictions)
 nth_point = math.ceil(len(prc_thresholds) / 1000)
 prc_points = list(zip(precision, recall, prc_thresholds))[::nth_point]
+prc_file = "evaluation/plots/precision_recall.json"
 with open(prc_file, "w") as fd:
     json.dump(
         {
@@ -55,14 +58,21 @@
         indent=4,
     )
 
-with open(roc_file, "w") as fd:
-    json.dump(
-        {
-            "roc": [
-                {"fpr": fp, "tpr": tp, "threshold": t}
-                for fp, tp, t in zip(fpr, tpr, roc_thresholds)
-            ]
-        },
-        fd,
-        indent=4,
-    )
+
+# ... confusion matrix plot
+predictions = [{
+    "actual": int(actual),
+    "predicted": 1 if predicted > 0.5 else 0
+} for actual, predicted in zip(labels, predictions)]
+with open("evaluation/plots/predictions.json", "w") as f:
+    json.dump(predictions, f)
+
+# ... and finally, we can dump an image, it's also supported:
+fig, axes = plt.subplots(dpi=800)
+fig.subplots_adjust(bottom=0.2, top=0.95)
+importances = model.feature_importances_
+forest_importances = pd.Series(importances, index=feature_names).nlargest(n=30)
+axes.set_ylabel("Mean decrease in impurity")
+forest_importances.plot.bar(ax=axes)
+fig.savefig('evaluation/importance.png')
+

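Note: the rewritten evaluate.py logs through dvclive instead of writing scores.json and roc.json by hand. A minimal standalone sketch using the same calls the diff uses, with toy values invented here:

```python
from dvclive import Live

# Toy labels and scores, invented for illustration.
labels = [0, 1, 1, 0, 1]
scores = [0.1, 0.9, 0.7, 0.4, 0.8]

live = Live("evaluation")              # outputs land under evaluation/
live.log("roc_auc", 0.83)              # scalar metric
live.log_plot("roc", labels, scores)   # curve data for `dvc plots`
```
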
example-get-started/code/src/featurization.py

Lines changed: 6 additions & 4 deletions
@@ -38,7 +38,7 @@ def get_df(data):
     return df
 
 
-def save_matrix(df, matrix, output):
+def save_matrix(df, matrix, names, output):
     id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
     label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T
 
@@ -48,7 +48,7 @@ def save_matrix(df, matrix, output):
     sys.stderr.write(msg.format(output, result.shape, result.dtype))
 
     with open(output, "wb") as fd:
-        pickle.dump(result, fd)
+        pickle.dump((result, names), fd)
     pass
 
 
@@ -64,16 +64,18 @@ def save_matrix(df, matrix, output):
 
 bag_of_words.fit(train_words)
 train_words_binary_matrix = bag_of_words.transform(train_words)
+feature_names = bag_of_words.get_feature_names_out()
 tfidf = TfidfTransformer(smooth_idf=False)
 tfidf.fit(train_words_binary_matrix)
 train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)
 
-save_matrix(df_train, train_words_tfidf_matrix, train_output)
+save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output)
 
 # Generate test feature matrix
 df_test = get_df(test_input)
 test_words = np.array(df_test.text.str.lower().values.astype("U"))
 test_words_binary_matrix = bag_of_words.transform(test_words)
 test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)
 
-save_matrix(df_test, test_words_tfidf_matrix, test_output)
+save_matrix(df_test, test_words_tfidf_matrix, feature_names, test_output)
+

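Note: featurization.py now pickles a `(matrix, feature_names)` tuple instead of a bare matrix, which is why train.py and evaluate.py unpack two values on load. A minimal round-trip sketch with toy stand-ins:

```python
import pickle

# Toy stand-ins; the real code stores a scipy sparse matrix plus the
# names from bag_of_words.get_feature_names_out().
matrix = [[1, 0], [0, 1]]
feature_names = ["python", "dataframe"]

with open("test.pkl", "wb") as fd:
    pickle.dump((matrix, feature_names), fd)  # new tuple format

with open("test.pkl", "rb") as fd:
    matrix, feature_names = pickle.load(fd)   # consumers unpack both values
```
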
example-get-started/code/src/prepare.py

Lines changed: 2 additions & 1 deletion
@@ -48,4 +48,5 @@ def process_posts(fd_in, fd_out_train, fd_out_test, target_tag):
 with io.open(input, encoding="utf8") as fd_in:
     with io.open(output_train, "w", encoding="utf8") as fd_out_train:
         with io.open(output_test, "w", encoding="utf8") as fd_out_test:
-            process_posts(fd_in, fd_out_train, fd_out_test, "<python>")
+            process_posts(fd_in, fd_out_train, fd_out_test, "<r>")
+
example-get-started/code/requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -1,4 +1,6 @@
+dvclive
 pandas
 pyaml
 scikit-learn
 scipy
+matplotlib

example-get-started/code/src/train.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
 min_split = params["min_split"]
 
 with open(os.path.join(input, "train.pkl"), "rb") as fd:
-    matrix = pickle.load(fd)
+    matrix, _ = pickle.load(fd)
 
 labels = np.squeeze(matrix[:, 1].toarray())
 x = matrix[:, 2:]

example-get-started/deploy.sh

Lines changed: 3 additions & 1 deletion
@@ -17,7 +17,9 @@ popd
 
 # Requires AWS CLI and write access to `s3://dvc-public/code/get-started/`.
 mv $PACKAGE_DIR/$PACKAGE .
-aws s3 cp --acl public-read $PACKAGE s3://dvc-public/code/get-started/$PACKAGE
+#aws s3 cp --acl public-read $PACKAGE s3://dvc-public/code/get-started/$PACKAGE
+
+exit
 
 # Sanity check
 wget https://code.dvc.org/get-started/$PACKAGE -O $TEST_PACKAGE
