Merge branch 'master' into loco-lib-current · convexsetgithub/loco-lib@465eeb0 · GitHub
[go: up one dir, main page]

Skip to content

Commit 465eeb0

Browse files
Merge branch 'master' into loco-lib-current
2 parents fb2cfce + bf08c14 commit 465eeb0

23 files changed

+1000
-3926
lines changed

LOCO/build.sbt

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,26 @@
11
name := "LOCO"
22

3-
version := "0.1"
3+
version := "0.2.0"
44

55
scalaVersion := "2.10.4"
66

77
// additional libraries
8+
{
89
libraryDependencies ++= Seq(
9-
"org.apache.spark" %% "spark-core" % "1.3.1" % "provided",
10-
"org.apache.spark" %% "spark-mllib" % "1.3.1",
10+
"org.apache.spark" %% "spark-core" % "1.5.1" % "provided",
11+
"org.apache.spark" %% "spark-mllib" % "1.5.1",
1112
"org.scalanlp" %% "breeze" % "0.11.2",
1213
"org.scalanlp" %% "breeze-natives" % "0.11.2",
13-
"cc.factorie" % "factorie_2.10" % "1.1.1",
14-
"com.github.fommil.netlib" % "all" % "1.1.2",
15-
"com.nativelibs4java" % "jnaerator" % "0.11",
16-
"net.java.dev.jna" % "jna" % "3.4.0"
17-
)
14+
"com.github.fommil.netlib" % "all" % "1.1.2" pomOnly())
15+
}
1816

1917
resolvers ++= Seq(
2018
"IESL Release" at "http://dev-iesl.cs.umass.edu/nexus/content/groups/public",
2119
"Sonatype Releases" at "https://oss.sonatype.org/content/repositories/releases/"
2220
)
2321

2422
// Configure jar named used with the assembly plug-in
25-
assemblyJarName in assembly := "LOCO-assembly-0.1.jar"
23+
assemblyJarName in assembly := "LOCO-assembly-0.2.0.jar"
2624

2725
// assembly merge strategy
2826
assemblyMergeStrategy in assembly := {

LOCO/project/build.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11

22
sbt.version=0.13.6
3+
javacOptions ++= Seq("-source", "1.7")

LOCO/run-LOCO-local.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ $SPARK_HOME/bin/spark-submit \
44
--class "LOCO.driver" \
55
--master local[4] \
66
--driver-memory 1G \
7-
target/scala-2.10/LOCO-assembly-0.1.jar \
7+
target/scala-2.10/LOCO-assembly-0.1.5.jar \
88
--classification=false \
99
--optimizer=SDCA \
1010
--numIterations=5000 \
@@ -14,7 +14,7 @@ target/scala-2.10/LOCO-assembly-0.1.jar \
1414
--trainingDatafile="../data/climate_train.txt" \
1515
--testDatafile="../data/climate_test.txt" \
1616
--center=true \
17-
--Proj=sparse \
17+
--projection=SDCT \
1818
--concatenate=true \
1919
--CVKind=none \
2020
--lambda=70 \

LOCO/src/main/java/fftw3/FFTW3Library.java

Lines changed: 0 additions & 2253 deletions
This file was deleted.

LOCO/src/main/scala/driver.scala

Lines changed: 86 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,23 @@
11
package LOCO
22

33

4+
import breeze.linalg.DenseVector
45
import org.apache.spark.rdd.RDD
6+
import org.apache.spark.storage.StorageLevel
57
import org.apache.spark.{SparkConf, SparkContext}
68

79
import org.apache.log4j.Logger
810
import org.apache.log4j.Level
911

10-
import preprocessingUtils.DataPoint
12+
import preprocessingUtils.FeatureVectorLP
1113
import preprocessingUtils.loadData.load
12-
import preprocessingUtils.loadData.load._
1314

1415
import LOCO.solvers.runLOCO
1516
import LOCO.utils.LOCOUtils._
1617
import LOCO.utils.CVUtils
1718

19+
import scala.io.Source
20+
1821

1922
object driver {
2023

@@ -44,107 +47,90 @@ object driver {
4447
// how many partitions of the data matrix to use
4548
val nPartitions = options.getOrElse("nPartitions","4").toInt
4649
// how many executors are used
47-
val nExecutors = options.getOrElse("nExecutors","4").toInt
48-
49-
// "text" or "object"
50-
val dataFormat = options.getOrElse("dataFormat", "text")
51-
// "libsvm", "spaces" or "comma"
52-
val textDataFormat = options.getOrElse("textDataFormat", "spaces")
53-
// input path
54-
val dataFile = options.getOrElse("dataFile", "../data/climate_train.txt")
55-
// provide training and test set as separate files?
56-
val separateTrainTestFiles = options.getOrElse("separateTrainTestFiles", "true").toBoolean
50+
val nExecutors = options.getOrElse("nExecutors","1").toInt
5751
// training input path
5852
val trainingDatafile =
59-
options.getOrElse("trainingDatafile", "../data/climate_train.txt")
53+
options.getOrElse("trainingDatafile", "../data/climate-serialized/climate-train-colwise/")
6054
// test input path
6155
val testDatafile =
62-
options.getOrElse("testDatafile", "../data/climate_test.txt")
63-
// if only one file is provided, proportion used to test set
64-
val proportionTest = options.getOrElse("proportionTest", "0.2").toDouble
56+
options.getOrElse("testDatafile", "../data/climate-serialized/climate-test-colwise/")
57+
// response vector - training
58+
val responsePathTrain =
59+
options.getOrElse("responsePathTrain", "../data/climate-serialized/climate-responseTrain.txt")
60+
// response vector - test
61+
val responsePathTest =
62+
options.getOrElse("responsePathTest", "../data/climate-serialized/climate-responseTest.txt")
63+
// number of features
64+
val nFeatsPath = options.getOrElse("nFeats", "../data/climate-serialized/climate-nFeats.txt")
6565
// random seed
66-
val myseed = options.getOrElse("seed", "3").toInt
66+
val randomSeed = options.getOrElse("seed", "3").toInt
67+
// shall sparse data structures be used?
68+
val useSparseStructure = options.getOrElse("useSparseStructure", "false").toBoolean
6769

6870
// 2) specify algorithm, loss function, and optimizer (if applicable)
6971

7072
// specify whether classification or ridge regression shall be used
7173
val classification = options.getOrElse("classification", "false").toBoolean
72-
// use factorie or SDCA
73-
val optimizer = options.getOrElse("optimizer", "SDCA")
7474
// number of iterations used in SDCA
75-
val numIterations = options.getOrElse("numIterations", "5000").toInt
75+
val numIterations = options.getOrElse("numIterations", "20000").toInt
7676
// set duality gap as convergence criterion
7777
val stoppingDualityGap = options.getOrElse("stoppingDualityGap", "0.01").toDouble
7878
// specify whether duality gap as convergence criterion shall be used
7979
val checkDualityGap = options.getOrElse("checkDualityGap", "false").toBoolean
8080

8181
// 3) algorithm-specific inputs
8282

83-
// center features and response
84-
val center = options.getOrElse("center", "true").toBoolean
85-
// center features only
86-
val centerFeaturesOnly = options.getOrElse("centerFeaturesOnly", "false").toBoolean
8783
// specify projection (sparse or SDCT)
88-
val projection = options.getOrElse("projection", "sparse")
89-
// specify flag for SDCT/FFTW: 64 corresponds to FFTW_ESTIMATE, 0 corresponds to FFTW_MEASURE
90-
val flagFFTW = options.getOrElse("flagFFTW", "64").toInt
84+
val projection = options.getOrElse("projection", "SDCT")
9185
// specify projection dimension
92-
val nFeatsProj = options.getOrElse("nFeatsProj", "260").toInt
86+
val nFeatsProj = options.getOrElse("nFeatsProj", "389").toInt
9387
// concatenate or add
94-
val concatenate = options.getOrElse("concatenate", "true").toBoolean
95-
// cross validation: "global", "local", or "none"
96-
val CVKind = options.getOrElse("CVKind", "none")
88+
val concatenate = options.getOrElse("concatenate", "false").toBoolean
89+
// cross validation
90+
val CV = options.getOrElse("CV", "false").toBoolean
9791
// k for k-fold CV
98-
val kfold = options.getOrElse("kfold", "5").toInt
92+
val kfold = options.getOrElse("kfold", "2").toInt
9993
// regularization parameter sequence start used in CV
100-
val lambdaSeqFrom = options.getOrElse("lambdaSeqFrom", "65").toDouble
94+
val lambdaSeqFrom = options.getOrElse("lambdaSeqFrom", "1").toDouble
10195
// regularization parameter sequence end used in CV
102-
val lambdaSeqTo = options.getOrElse("lambdaSeqTo", "80").toDouble
96+
val lambdaSeqTo = options.getOrElse("lambdaSeqTo", "10").toDouble
10397
// regularization parameter sequence step size used in CV
10498
val lambdaSeqBy = options.getOrElse("lambdaSeqBy", "1").toDouble
10599
// create lambda sequence
106100
val lambdaSeq = lambdaSeqFrom to lambdaSeqTo by lambdaSeqBy
107101
// regularization parameter to be used if CVKind == "none"
108-
val lambda = options.getOrElse("lambda", "70").toDouble
102+
val lambda = options.getOrElse("lambda", "95").toDouble
109103

110104
// print out inputs
111105
println("\nSpecify input and output options: ")
112-
println("dataFormat: " + dataFormat)
113-
if(dataFormat == "text"){
114-
println("textDataFormat: " + textDataFormat)
115-
}
116-
println("separateTrainTestFiles: " + separateTrainTestFiles)
117-
if(separateTrainTestFiles){
118-
println("trainingDatafile: " + trainingDatafile)
119-
println("testDatafile: " + testDatafile)
120-
}else {
121-
println("dataFile: " + dataFile)
122-
println("proportionTest: " + proportionTest)
123-
}
106+
107+
println("trainingDatafile: " + trainingDatafile)
108+
println("responsePathTrain: " + responsePathTrain)
109+
println("testDatafile: " + testDatafile)
110+
println("responsePathTest: " + responsePathTest)
111+
println("nFeatsPath: " + nFeatsPath)
112+
println("useSparseStructure: " + useSparseStructure)
113+
124114
println("outdir: " + outdir)
125115
println("saveToHDFS: " + saveToHDFS)
126-
println("seed: " + myseed)
116+
println("seed: " + randomSeed)
127117

128118
println("\nSpecify number of partitions, " +
129119
"algorithm, loss function, and optimizer (if applicable): ")
130120
println("nPartitions: " + nPartitions)
131121
println("nExecutors: " + nExecutors)
132122
println("classification: " + classification)
133-
println("optimizer: " + optimizer)
134123
println("numIterations: " + numIterations)
135124
println("checkDualityGap: " + checkDualityGap)
136125
println("stoppingDualityGap: " + stoppingDualityGap)
137126

138127
println("\nAlgorithm-specific inputs: ")
139-
println("center: " + center)
140-
println("centerFeaturesOnly: " + centerFeaturesOnly)
141128
println("projection: " + projection)
142-
println("flagFFTW: " + flagFFTW)
143129
println("nFeatsProj: " + nFeatsProj)
144130
println("concatenate: " + concatenate)
145-
println("CVKind: " + CVKind)
131+
println("CV: " + CV)
146132
println("kfold: " + kfold)
147-
if(CVKind != "none"){
133+
if(CV){
148134
println("lambdaSeq: " + lambdaSeq)
149135
}else{
150136
println("lambda: " + lambda)
@@ -175,62 +161,68 @@ object driver {
175161
Logger.getLogger("org").setLevel(Level.WARN)
176162
Logger.getLogger("akka").setLevel(Level.WARN)
177163

178-
// read in training and test data, distribute over rows
179-
val (training : RDD[DataPoint], test : RDD[DataPoint]) =
180-
dataFormat match {
181-
182-
// input files are text files
183-
case "text" => {
184-
val (training_temp, test_temp) =
185-
load.readTextFiles(
186-
sc, dataFile, nPartitions, textDataFormat, separateTrainTestFiles,
187-
trainingDatafile, testDatafile, proportionTest, myseed)
188-
189-
// convert RDD[Array(Double)] to RDD[DataPoint]
190-
(training_temp.map(x => doubleArrayToDataPoint(x)),
191-
test_temp.map(x => doubleArrayToDataPoint(x)))
192-
}
193-
194-
// input files are object files
195-
case "object" =>
196-
load.readObjectFiles[DataPoint](
197-
sc, dataFile, nPartitions, separateTrainTestFiles, trainingDatafile,
198-
testDatafile, proportionTest, myseed)
199-
200-
// throw exception if another option is given
201-
case _ => throw new Error("No such data format option (use text or object)!")
202-
}
164+
// read in training and test data, distributed over columns
165+
val (training : RDD[FeatureVectorLP], test : RDD[FeatureVectorLP]) =
166+
load.readObjectFiles[FeatureVectorLP](
167+
sc, null, nPartitions, true, trainingDatafile,
168+
testDatafile, 0.2, randomSeed)
203169

204-
// if cross validation is chosen to be "global", cross-validate
205-
// targeting the global prediction error
206-
val lambdaGlobal =
207-
if(CVKind == "global"){
170+
// repartition
171+
val trainingPartitioned: RDD[FeatureVectorLP] =
172+
training
173+
.repartition(nPartitions)
174+
.persist(StorageLevel.MEMORY_AND_DISK)
175+
176+
// force evaluation to allow for proper timing
177+
trainingPartitioned.foreach(x => {})
178+
179+
// read response vectors
180+
val responseTrain = DenseVector(load.readResponse(responsePathTrain).toArray)
181+
val responseTest = DenseVector(load.readResponse(responsePathTest).toArray)
182+
183+
// read number of features
184+
val nFeats = Source.fromFile(nFeatsPath).getLines().mkString.toInt
185+
186+
// start timing for cross validation
187+
val CVStart = System.currentTimeMillis()
188+
189+
// cross validation
190+
val lambdaCV =
191+
if(CV){
208192
CVUtils.globalCV(
209-
sc, classification, myseed, training, center, centerFeaturesOnly, nPartitions,
210-
nExecutors, projection, flagFFTW, concatenate, nFeatsProj, lambdaSeq, kfold, optimizer,
193+
sc, classification, randomSeed, trainingPartitioned, responseTrain, nFeats,
194+
nPartitions, nExecutors, projection, useSparseStructure,
195+
concatenate, nFeatsProj, lambdaSeq, kfold,
211196
numIterations, checkDualityGap, stoppingDualityGap)
212197
}else{
213198
lambda
214199
}
215200

201+
// stop timing for cross validation
202+
val CVTime = System.currentTimeMillis() - CVStart
203+
216204
// compute LOCO coefficients
217-
val (betaLoco, startTime, colMeans, meanResponse) =
205+
val (betaLoco, startTime, afterRPTime, afterCommTime) =
218206
runLOCO.run(
219-
sc, classification, myseed, training, center, centerFeaturesOnly, nPartitions, nExecutors,
220-
projection, flagFFTW, concatenate, nFeatsProj, lambdaGlobal, CVKind, lambdaSeq, kfold,
221-
optimizer, numIterations, checkDualityGap, stoppingDualityGap)
207+
sc, classification, randomSeed, trainingPartitioned, responseTrain, nFeats,
208+
nPartitions, nExecutors, projection, useSparseStructure,
209+
concatenate, nFeatsProj, lambdaCV,
210+
numIterations, checkDualityGap, stoppingDualityGap)
222211

223212
// get second timestamp needed to time LOCO and compute time difference
224213
val endTime = System.currentTimeMillis
225214
val runTime = endTime - startTime
215+
val RPTime = afterRPTime - startTime
216+
val communicationTime = afterCommTime - afterRPTime
217+
val restTime = runTime - RPTime - communicationTime
226218

227219
// print summary stats
228220
printSummaryStatistics(
229-
sc, classification, optimizer, numIterations, startTime, runTime,
230-
betaLoco, training, test, center, centerFeaturesOnly, meanResponse, colMeans, dataFormat,
231-
separateTrainTestFiles, trainingDatafile, testDatafile, dataFile, proportionTest, nPartitions,
232-
nExecutors, nFeatsProj, projection, flagFFTW, concatenate, lambda, CVKind, lambdaSeq, kfold,
233-
myseed, lambdaGlobal, checkDualityGap, stoppingDualityGap, saveToHDFS, directoryNameResultsFolder)
221+
sc, classification, numIterations, startTime, runTime, RPTime, communicationTime, restTime, CVTime,
222+
betaLoco, trainingPartitioned, test, responseTrain, responseTest, trainingDatafile, testDatafile,
223+
responsePathTrain, responsePathTest, nPartitions, nExecutors, nFeatsProj, projection,
224+
useSparseStructure, concatenate, lambda, CV, lambdaSeq, kfold, randomSeed, lambdaCV,
225+
checkDualityGap, stoppingDualityGap, saveToHDFS, directoryNameResultsFolder)
234226

235227
// compute end time of application and compute time needed overall
236228
val globalEndTime = System.currentTimeMillis

LOCO/src/main/scala/solvers/SDCA.scala

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,26 @@ import preprocessingUtils.DataPoint
77

88
object SDCA {
99
/**
10-
* This is an implementation of LocalDualMethod, here LocalSDCA (coordinate ascent),
11-
* with taking the information of the other workers into account, by respecting the
12-
* shared wInit vector.
13-
* Here we perform coordinate updates for the SVM dual objective (hinge loss).
10+
* Runs SDCA on local data matrix.
1411
*
1512
* Note that SDCA for hinge-loss is equivalent to LibLinear, where using the
1613
* regularization parameter C = 1.0/(lambda*numExamples), and re-scaling
1714
* the alpha variables with 1/C.
1815
*
19-
* @param localData the local data examples
16+
* @param localData Local data examples
17+
* @param response Response vector
2018
* @param localIters number of local coordinates to update
21-
* @param lambda
22-
* @param n
23-
* @param seed
24-
* @return deltaAlpha and deltaW, summarizing the performed local changes, see paper
19+
* @param lambda Regularization parameter
20+
* @param n Number of observations
21+
* @param seed Random seed
22+
* @param checkDualityGap Specify whether the duality gap should be computed after each iteration.
23+
* Note that this is expensive as it requires a pass over the entire (local)
24+
* data set. Should only be used for tuning purposes.
25+
* @param stoppingDualityGap Specify the size of the duality gap at which the optimization should
26+
* end. If it is not reached after numIterations, the optimization ends
27+
* nonetheless.
28+
*
29+
* @return alpha - dual solution vector
2530
*/
2631
def localSDCA(
2732
localData: DenseMatrix[Double],

0 commit comments

Comments (0)