@@ -1,20 +1,23 @@
 package LOCO


+import breeze.linalg.DenseVector
 import org.apache.spark.rdd.RDD
+import org.apache.spark.storage.StorageLevel
 import org.apache.spark.{SparkConf, SparkContext}

 import org.apache.log4j.Logger
 import org.apache.log4j.Level

-import preprocessingUtils.DataPoint
+import preprocessingUtils.FeatureVectorLP
 import preprocessingUtils.loadData.load
-import preprocessingUtils.loadData.load._

 import LOCO.solvers.runLOCO
 import LOCO.utils.LOCOUtils._
 import LOCO.utils.CVUtils

+import scala.io.Source
+

 object driver {

@@ -44,107 +47,90 @@ object driver {
     // how many partitions of the data matrix to use
     val nPartitions = options.getOrElse("nPartitions","4").toInt
     // how many executors are used
-    val nExecutors = options.getOrElse("nExecutors","4").toInt
-
-    // "text" or "object"
-    val dataFormat = options.getOrElse("dataFormat", "text")
-    // "libsvm", "spaces" or "comma"
-    val textDataFormat = options.getOrElse("textDataFormat", "spaces")
-    // input path
-    val dataFile = options.getOrElse("dataFile", "../data/climate_train.txt")
-    // provide training and test set as separate files?
-    val separateTrainTestFiles = options.getOrElse("separateTrainTestFiles", "true").toBoolean
+    val nExecutors = options.getOrElse("nExecutors","1").toInt
     // training input path
     val trainingDatafile =
-      options.getOrElse("trainingDatafile", "../data/climate_train.txt")
+      options.getOrElse("trainingDatafile", "../data/climate-serialized/climate-train-colwise/")
     // test input path
     val testDatafile =
-      options.getOrElse("testDatafile", "../data/climate_test.txt")
-    // if only one file is provided, proportion used to test set
-    val proportionTest = options.getOrElse("proportionTest", "0.2").toDouble
+      options.getOrElse("testDatafile", "../data/climate-serialized/climate-test-colwise/")
+    // response vector - training
+    val responsePathTrain =
+      options.getOrElse("responsePathTrain", "../data/climate-serialized/climate-responseTrain.txt")
+    // response vector - test
+    val responsePathTest =
+      options.getOrElse("responsePathTest", "../data/climate-serialized/climate-responseTest.txt")
+    // number of features
+    val nFeatsPath = options.getOrElse("nFeats", "../data/climate-serialized/climate-nFeats.txt")
     // random seed
-    val myseed = options.getOrElse("seed", "3").toInt
+    val randomSeed = options.getOrElse("seed", "3").toInt
+    // shall sparse data structures be used?
+    val useSparseStructure = options.getOrElse("useSparseStructure", "false").toBoolean

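Note for readers: the driver pulls every setting from an `options: Map[String, String]` built in a part of the file this diff does not touch. For orientation, a minimal, hypothetical sketch of how such a map is typically assembled from `key=value` command-line arguments (this is not the repository's actual parsing code):

```scala
// Hypothetical sketch (not LOCO's parser): build the options map
// from command-line arguments given as "key=value" pairs.
def parseArgs(args: Array[String]): Map[String, String] =
  args.flatMap { arg =>
    arg.split("=", 2) match {
      case Array(key, value) => Some(key -> value)
      case _                 => None // skip malformed arguments
    }
  }.toMap

// usage:
//   val options = parseArgs(args)
//   val nPartitions = options.getOrElse("nPartitions", "4").toInt
```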
     // 2) specify algorithm, loss function, and optimizer (if applicable)

     // specify whether classification or ridge regression shall be used
     val classification = options.getOrElse("classification", "false").toBoolean
-    // use factorie or SDCA
-    val optimizer = options.getOrElse("optimizer", "SDCA")
     // number of iterations used in SDCA
-    val numIterations = options.getOrElse("numIterations", "5000").toInt
+    val numIterations = options.getOrElse("numIterations", "20000").toInt
     // set duality gap as convergence criterion
     val stoppingDualityGap = options.getOrElse("stoppingDualityGap", "0.01").toDouble
     // specify whether duality gap as convergence criterion shall be used
     val checkDualityGap = options.getOrElse("checkDualityGap", "false").toBoolean

     // 3) algorithm-specific inputs

-    // center features and response
-    val center = options.getOrElse("center", "true").toBoolean
-    // center features only
-    val centerFeaturesOnly = options.getOrElse("centerFeaturesOnly", "false").toBoolean
     // specify projection (sparse or SDCT)
-    val projection = options.getOrElse("projection", "sparse")
-    // specify flag for SDCT/FFTW: 64 corresponds to FFTW_ESTIMATE, 0 corresponds to FFTW_MEASURE
-    val flagFFTW = options.getOrElse("flagFFTW", "64").toInt
+    val projection = options.getOrElse("projection", "SDCT")
     // specify projection dimension
-    val nFeatsProj = options.getOrElse("nFeatsProj", "260").toInt
+    val nFeatsProj = options.getOrElse("nFeatsProj", "389").toInt
     // concatenate or add
-    val concatenate = options.getOrElse("concatenate", "true").toBoolean
-    // cross validation: "global", "local", or "none"
-    val CVKind = options.getOrElse("CVKind", "none")
+    val concatenate = options.getOrElse("concatenate", "false").toBoolean
+    // cross validation
+    val CV = options.getOrElse("CV", "false").toBoolean
     // k for k-fold CV
-    val kfold = options.getOrElse("kfold", "5").toInt
+    val kfold = options.getOrElse("kfold", "2").toInt
     // regularization parameter sequence start used in CV
-    val lambdaSeqFrom = options.getOrElse("lambdaSeqFrom", "65").toDouble
+    val lambdaSeqFrom = options.getOrElse("lambdaSeqFrom", "1").toDouble
     // regularization parameter sequence end used in CV
-    val lambdaSeqTo = options.getOrElse("lambdaSeqTo", "80").toDouble
+    val lambdaSeqTo = options.getOrElse("lambdaSeqTo", "10").toDouble
     // regularization parameter sequence step size used in CV
     val lambdaSeqBy = options.getOrElse("lambdaSeqBy", "1").toDouble
     // create lambda sequence
     val lambdaSeq = lambdaSeqFrom to lambdaSeqTo by lambdaSeqBy
-    // regularization parameter to be used if CVKind == "none"
-    val lambda = options.getOrElse("lambda", "70").toDouble
+    // regularization parameter to be used if CV == false
+    val lambda = options.getOrElse("lambda", "95").toDouble

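`lambdaSeqFrom to lambdaSeqTo by lambdaSeqBy` builds an inclusive `NumericRange[Double]`, so the new defaults give a ten-point CV grid. A quick REPL check of the values (valid on the Scala 2.10/2.11 toolchain this project targets; `Double` ranges were removed from later Scala versions):

```scala
// Default CV grid: lambdaSeqFrom = 1, lambdaSeqTo = 10, lambdaSeqBy = 1.
// Both endpoints are included.
val lambdaSeq = 1.0 to 10.0 by 1.0
println(lambdaSeq.toList)
// List(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0)
```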
     // print out inputs
     println("\nSpecify input and output options: ")
-    println("dataFormat: " + dataFormat)
-    if(dataFormat == "text"){
-      println("textDataFormat: " + textDataFormat)
-    }
-    println("separateTrainTestFiles: " + separateTrainTestFiles)
-    if(separateTrainTestFiles){
-      println("trainingDatafile: " + trainingDatafile)
-      println("testDatafile: " + testDatafile)
-    }else {
-      println("dataFile: " + dataFile)
-      println("proportionTest: " + proportionTest)
-    }
+
+    println("trainingDatafile: " + trainingDatafile)
+    println("responsePathTrain: " + responsePathTrain)
+    println("testDatafile: " + testDatafile)
+    println("responsePathTest: " + responsePathTest)
+    println("nFeatsPath: " + nFeatsPath)
+    println("useSparseStructure: " + useSparseStructure)
+
     println("outdir: " + outdir)
     println("saveToHDFS: " + saveToHDFS)
-    println("seed: " + myseed)
+    println("seed: " + randomSeed)

     println("\nSpecify number of partitions, " +
       "algorithm, loss function, and optimizer (if applicable): ")
     println("nPartitions: " + nPartitions)
     println("nExecutors: " + nExecutors)
     println("classification: " + classification)
-    println("optimizer: " + optimizer)
     println("numIterations: " + numIterations)
     println("checkDualityGap: " + checkDualityGap)
     println("stoppingDualityGap: " + stoppingDualityGap)

     println("\nAlgorithm-specific inputs: ")
-    println("center: " + center)
-    println("centerFeaturesOnly: " + centerFeaturesOnly)
     println("projection: " + projection)
-    println("flagFFTW: " + flagFFTW)
     println("nFeatsProj: " + nFeatsProj)
     println("concatenate: " + concatenate)
-    println("CVKind: " + CVKind)
+    println("CV: " + CV)
     println("kfold: " + kfold)
-    if(CVKind != "none"){
+    if(CV){
       println("lambdaSeq: " + lambdaSeq)
     }else{
       println("lambda: " + lambda)
@@ -175,62 +161,68 @@ object driver {
     Logger.getLogger("org").setLevel(Level.WARN)
     Logger.getLogger("akka").setLevel(Level.WARN)

-    // read in training and test data, distribute over rows
-    val (training : RDD[DataPoint], test : RDD[DataPoint]) =
-      dataFormat match {
-
-        // input files are text files
-        case "text" => {
-          val (training_temp, test_temp) =
-            load.readTextFiles(
-              sc, dataFile, nPartitions, textDataFormat, separateTrainTestFiles,
-              trainingDatafile, testDatafile, proportionTest, myseed)
-
-          // convert RDD[Array(Double)] to RDD[DataPoint]
-          (training_temp.map(x => doubleArrayToDataPoint(x)),
-            test_temp.map(x => doubleArrayToDataPoint(x)))
-        }
-
-        // input files are object files
-        case "object" =>
-          load.readObjectFiles[DataPoint](
-            sc, dataFile, nPartitions, separateTrainTestFiles, trainingDatafile,
-            testDatafile, proportionTest, myseed)
-
-        // throw exception if another option is given
-        case _ => throw new Error("No such data format option (use text or object)!")
-      }
+    // read in training and test data, distributed over columns
+    val (training : RDD[FeatureVectorLP], test : RDD[FeatureVectorLP]) =
+      load.readObjectFiles[FeatureVectorLP](
+        sc, null, nPartitions, true, trainingDatafile,
+        testDatafile, 0.2, randomSeed)

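With separate train and test paths now always supplied, the `null` data file and the `0.2` test proportion passed to `load.readObjectFiles` are unused placeholders kept for the old signature. In plain Spark terms, loading the two serialized column-wise RDDs boils down to something like this sketch (assuming the object files were written with `RDD.saveAsObjectFile`; the helper itself is illustrative, not the repository's API):

```scala
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import preprocessingUtils.FeatureVectorLP

// Illustrative helper: read feature-vector RDDs previously written
// with saveAsObjectFile; minPartitions is only a hint to Spark.
def readSerialized(sc: SparkContext, trainPath: String, testPath: String,
                   minPartitions: Int): (RDD[FeatureVectorLP], RDD[FeatureVectorLP]) =
  (sc.objectFile[FeatureVectorLP](trainPath, minPartitions),
   sc.objectFile[FeatureVectorLP](testPath, minPartitions))
```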
-    // if cross validation is chosen to be "global", cross-validate
-    // targeting the global prediction error
-    val lambdaGlobal =
-      if(CVKind == "global"){
+    // repartition
+    val trainingPartitioned: RDD[FeatureVectorLP] =
+      training
+        .repartition(nPartitions)
+        .persist(StorageLevel.MEMORY_AND_DISK)
+
+    // force evaluation to allow for proper timing
+    trainingPartitioned.foreach(x => {})
+
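`persist` only marks the RDD for caching; Spark computes nothing until an action runs, and the no-op `foreach` above is that action. A standalone sketch of the same pattern (assumes a `SparkContext` named `sc`):

```scala
import org.apache.spark.storage.StorageLevel

// Transformations are lazy: this line alone does no cluster work.
val cached = sc.parallelize(1 to 1000000)
  .repartition(8)
  .persist(StorageLevel.MEMORY_AND_DISK)

// Any action forces evaluation; a no-op foreach (or count())
// materializes the cache so later timings exclude load time.
cached.foreach(_ => ())
```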
+    // read response vectors
+    val responseTrain = DenseVector(load.readResponse(responsePathTrain).toArray)
+    val responseTest = DenseVector(load.readResponse(responsePathTest).toArray)
+
+    // read number of features
+    val nFeats = Source.fromFile(nFeatsPath).getLines().mkString.toInt
+
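One small caveat in the `nFeats` line above: `Source.fromFile` opens a file handle that is never closed. Harmless in a short-lived driver, but a tidier variant would be (sketch, using the driver's `nFeatsPath`):

```scala
import scala.io.Source

// Read the single integer in the nFeats file, then release the handle.
val src = Source.fromFile(nFeatsPath)
val nFeats =
  try src.getLines().mkString.trim.toInt
  finally src.close()
```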
+    // start timing for cross validation
+    val CVStart = System.currentTimeMillis()
+
+    // cross validation
+    val lambdaCV =
+      if(CV){
         CVUtils.globalCV(
-          sc, classification, myseed, training, center, centerFeaturesOnly, nPartitions,
-          nExecutors, projection, flagFFTW, concatenate, nFeatsProj, lambdaSeq, kfold, optimizer,
+          sc, classification, randomSeed, trainingPartitioned, responseTrain, nFeats,
+          nPartitions, nExecutors, projection, useSparseStructure,
+          concatenate, nFeatsProj, lambdaSeq, kfold,
           numIterations, checkDualityGap, stoppingDualityGap)
       }else{
         lambda
       }

+    // stop timing for cross validation
+    val CVTime = System.currentTimeMillis() - CVStart
+
     // compute LOCO coefficients
-    val (betaLoco, startTime, colMeans, meanResponse) =
+    val (betaLoco, startTime, afterRPTime, afterCommTime) =
       runLOCO.run(
-        sc, classification, myseed, training, center, centerFeaturesOnly, nPartitions, nExecutors,
-        projection, flagFFTW, concatenate, nFeatsProj, lambdaGlobal, CVKind, lambdaSeq, kfold,
-        optimizer, numIterations, checkDualityGap, stoppingDualityGap)
+        sc, classification, randomSeed, trainingPartitioned, responseTrain, nFeats,
+        nPartitions, nExecutors, projection, useSparseStructure,
+        concatenate, nFeatsProj, lambdaCV,
+        numIterations, checkDualityGap, stoppingDualityGap)

     // get second timestamp needed to time LOCO and compute time difference
     val endTime = System.currentTimeMillis
     val runTime = endTime - startTime
+    val RPTime = afterRPTime - startTime
+    val communicationTime = afterCommTime - afterRPTime
+    val restTime = runTime - RPTime - communicationTime

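Judging by the variable names, the two extra timestamps returned by `runLOCO.run` split the total runtime into a random-projection phase, a communication phase, and everything else. A toy check of the arithmetic with made-up timestamps:

```scala
// Toy check of the timing decomposition: the three phase durations
// tile [startTime, endTime] exactly, with no gaps or overlap.
val startTime     = 0L   // runLOCO.run starts
val afterRPTime   = 40L  // random projections computed
val afterCommTime = 70L  // projections exchanged between workers
val endTime       = 100L // coefficients assembled

val runTime           = endTime - startTime                  // 100
val RPTime            = afterRPTime - startTime              // 40
val communicationTime = afterCommTime - afterRPTime          // 30
val restTime          = runTime - RPTime - communicationTime // 30
assert(RPTime + communicationTime + restTime == runTime)
```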
     // print summary stats
     printSummaryStatistics(
-      sc, classification, optimizer, numIterations, startTime, runTime,
-      betaLoco, training, test, center, centerFeaturesOnly, meanResponse, colMeans, dataFormat,
-      separateTrainTestFiles, trainingDatafile, testDatafile, dataFile, proportionTest, nPartitions,
-      nExecutors, nFeatsProj, projection, flagFFTW, concatenate, lambda, CVKind, lambdaSeq, kfold,
-      myseed, lambdaGlobal, checkDualityGap, stoppingDualityGap, saveToHDFS, directoryNameResultsFolder)
+      sc, classification, numIterations, startTime, runTime, RPTime, communicationTime, restTime, CVTime,
+      betaLoco, trainingPartitioned, test, responseTrain, responseTest, trainingDatafile, testDatafile,
+      responsePathTrain, responsePathTest, nPartitions, nExecutors, nFeatsProj, projection,
+      useSparseStructure, concatenate, lambda, CV, lambdaSeq, kfold, randomSeed, lambdaCV,
+      checkDualityGap, stoppingDualityGap, saveToHDFS, directoryNameResultsFolder)

     // compute end time of application and compute time needed overall
     val globalEndTime = System.currentTimeMillis