adding sparse matrix support · convexsetgithub/loco-lib@5bd0ae3 · GitHub
[go: up one dir, main page]

Skip to content

Commit 5bd0ae3

Browse files
adding sparse matrix support
1 parent ec079f4 commit 5bd0ae3

File tree

10 files changed

+174
-67
lines changed

10 files changed

+174
-67
lines changed

LOCO/build.sbt

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,30 @@
11
name := "LOCO"
22

3-
version := "0.1"
3+
version := "0.1.1"
44

55
scalaVersion := "2.10.4"
66

77
// additional libraries
8+
{
89
libraryDependencies ++= Seq(
9-
"org.apache.spark" %% "spark-core" % "1.3.1" % "provided",
10-
"org.apache.spark" %% "spark-mllib" % "1.3.1",
10+
"org.apache.spark" %% "spark-core" % "1.4.1" % "provided",
11+
"org.apache.spark" %% "spark-mllib" % "1.4.1",
1112
"org.scalanlp" %% "breeze" % "0.11.2",
1213
"org.scalanlp" %% "breeze-natives" % "0.11.2",
1314
"cc.factorie" % "factorie_2.10" % "1.1.1",
1415
"com.github.fommil.netlib" % "all" % "1.1.2",
1516
"com.nativelibs4java" % "jnaerator" % "0.11",
1617
"net.java.dev.jna" % "jna" % "3.4.0"
1718
)
19+
}
1820

1921
resolvers ++= Seq(
2022
"IESL Release" at "http://dev-iesl.cs.umass.edu/nexus/content/groups/public",
2123
"Sonatype Releases" at "https://oss.sonatype.org/content/repositories/releases/"
2224
)
2325

2426
// Configure jar named used with the assembly plug-in
25-
assemblyJarName in assembly := "LOCO-assembly-0.1.jar"
27+
assemblyJarName in assembly := "LOCO-assembly-0.1.1.jar"
2628

2729
// assembly merge strategy
2830
assemblyMergeStrategy in assembly := {

LOCO/src/main/scala/driver.scala

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ object driver {
4242
// specify whether output shall be saved on HDFS
4343
val saveToHDFS = options.getOrElse("saveToHDFS", "false").toBoolean
4444
// how many partitions of the data matrix to use
45-
val nPartitions = options.getOrElse("nPartitions","4").toInt
45+
val nPartitions = options.getOrElse("nPartitions","8").toInt
4646
// how many executors are used
4747
val nExecutors = options.getOrElse("nExecutors","4").toInt
4848

@@ -56,10 +56,10 @@ object driver {
5656
val separateTrainTestFiles = options.getOrElse("separateTrainTestFiles", "true").toBoolean
5757
// training input path
5858
val trainingDatafile =
59-
options.getOrElse("trainingDatafile", "../data/climate_train.txt")
59+
options.getOrElse("trainingDatafile", "../data/climate_pres_scaled_p2p3_12_train.txt")
6060
// test input path
6161
val testDatafile =
62-
options.getOrElse("testDatafile", "../data/climate_test.txt")
62+
options.getOrElse("testDatafile", "../data/climate_pres_scaled_p2p3_12_test.txt")
6363
// if only one file is provided, proportion used to test set
6464
val proportionTest = options.getOrElse("proportionTest", "0.2").toDouble
6565
// random seed
@@ -72,7 +72,7 @@ object driver {
7272
// use factorie or SDCA
7373
val optimizer = options.getOrElse("optimizer", "SDCA")
7474
// number of iterations used in SDCA
75-
val numIterations = options.getOrElse("numIterations", "5000").toInt
75+
val numIterations = options.getOrElse("numIterations", "20000").toInt
7676
// set duality gap as convergence criterion
7777
val stoppingDualityGap = options.getOrElse("stoppingDualityGap", "0.01").toDouble
7878
// specify whether duality gap as convergence criterion shall be used
@@ -86,20 +86,22 @@ object driver {
8686
val centerFeaturesOnly = options.getOrElse("centerFeaturesOnly", "false").toBoolean
8787
// specify projection (sparse or SDCT)
8888
val projection = options.getOrElse("projection", "sparse")
89+
// shall sparse data structures be used?
90+
val useSparseStructure = options.getOrElse("useSparseStructure", "false").toBoolean
8991
// specify flag for SDCT/FFTW: 64 corresponds to FFTW_ESTIMATE, 0 corresponds to FFTW_MEASURE
9092
val flagFFTW = options.getOrElse("flagFFTW", "64").toInt
9193
// specify projection dimension
92-
val nFeatsProj = options.getOrElse("nFeatsProj", "260").toInt
94+
val nFeatsProj = options.getOrElse("nFeatsProj", "200").toInt
9395
// concatenate or add
94-
val concatenate = options.getOrElse("concatenate", "true").toBoolean
96+
val concatenate = options.getOrElse("concatenate", "false").toBoolean
9597
// cross validation: "global", "local", or "none"
9698
val CVKind = options.getOrElse("CVKind", "none")
9799
// k for k-fold CV
98100
val kfold = options.getOrElse("kfold", "5").toInt
99101
// regularization parameter sequence start used in CV
100102
val lambdaSeqFrom = options.getOrElse("lambdaSeqFrom", "65").toDouble
101103
// regularization parameter sequence end used in CV
102-
val lambdaSeqTo = options.getOrElse("lambdaSeqTo", "80").toDouble
104+
val lambdaSeqTo = options.getOrElse("lambdaSeqTo", "66").toDouble
103105
// regularization parameter sequence step size used in CV
104106
val lambdaSeqBy = options.getOrElse("lambdaSeqBy", "1").toDouble
105107
// create lambda sequence
@@ -139,6 +141,7 @@ object driver {
139141
println("center: " + center)
140142
println("centerFeaturesOnly: " + centerFeaturesOnly)
141143
println("projection: " + projection)
144+
println("useSparseStructure: " + useSparseStructure)
142145
println("flagFFTW: " + flagFFTW)
143146
println("nFeatsProj: " + nFeatsProj)
144147
println("concatenate: " + concatenate)
@@ -207,7 +210,8 @@ object driver {
207210
if(CVKind == "global"){
208211
CVUtils.globalCV(
209212
sc, classification, myseed, training, center, centerFeaturesOnly, nPartitions,
210-
nExecutors, projection, flagFFTW, concatenate, nFeatsProj, lambdaSeq, kfold, optimizer,
213+
nExecutors, projection, flagFFTW, useSparseStructure,
214+
concatenate, nFeatsProj, lambdaSeq, kfold, optimizer,
211215
numIterations, checkDualityGap, stoppingDualityGap)
212216
}else{
213217
lambda
@@ -217,7 +221,8 @@ object driver {
217221
val (betaLoco, startTime, colMeans, meanResponse) =
218222
runLOCO.run(
219223
sc, classification, myseed, training, center, centerFeaturesOnly, nPartitions, nExecutors,
220-
projection, flagFFTW, concatenate, nFeatsProj, lambdaGlobal, CVKind, lambdaSeq, kfold,
224+
projection, flagFFTW, useSparseStructure,
225+
concatenate, nFeatsProj, lambdaGlobal, CVKind, lambdaSeq, kfold,
221226
optimizer, numIterations, checkDualityGap, stoppingDualityGap)
222227

223228
// get second timestamp needed to time LOCO and compute time difference
@@ -229,7 +234,8 @@ object driver {
229234
sc, classification, optimizer, numIterations, startTime, runTime,
230235
betaLoco, training, test, center, centerFeaturesOnly, meanResponse, colMeans, dataFormat,
231236
separateTrainTestFiles, trainingDatafile, testDatafile, dataFile, proportionTest, nPartitions,
232-
nExecutors, nFeatsProj, projection, flagFFTW, concatenate, lambda, CVKind, lambdaSeq, kfold,
237+
nExecutors, nFeatsProj, projection, flagFFTW, useSparseStructure,
238+
concatenate, lambda, CVKind, lambdaSeq, kfold,
233239
myseed, lambdaGlobal, checkDualityGap, stoppingDualityGap, saveToHDFS, directoryNameResultsFolder)
234240

235241
// compute end time of application and compute time needed overall

LOCO/src/main/scala/solvers/localDual.scala

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ object localDual {
3737
* @return
3838
*/
3939
def runLocalDualAdd(
40-
rawAndRandomFeatsWithIndex: (Int, (List[Int], DenseMatrix[Double], DenseMatrix[Double])),
41-
RPsAdded : DenseMatrix[Double],
40+
rawAndRandomFeatsWithIndex: (Int, (List[Int], Matrix[Double], Matrix[Double])),
41+
RPsAdded : Matrix[Double],
4242
response : Vector[Double],
4343
doCV : Boolean,
4444
kFold : Int,
@@ -66,8 +66,8 @@ object localDual {
6666

6767

6868
def runLocalDualConcatenate(
69-
rawFeatsWithIndex: (Int, (List[Int], DenseMatrix[Double])),
70-
RPsMap : collection.Map[Int, DenseMatrix[Double]],
69+
rawFeatsWithIndex: (Int, (List[Int], Matrix[Double])),
70+
RPsMap : collection.Map[Int, Matrix[Double]],
7171
response : Vector[Double],
7272
doCV : Boolean,
7373
k : Int,
@@ -96,8 +96,8 @@ object localDual {
9696

9797

9898
def runLocalDual(
99-
matrixWithIndex: (List[Int], DenseMatrix[Double]),
100-
randomMats : DenseMatrix[Double],
99+
matrixWithIndex: (List[Int], Matrix[Double]),
100+
randomMats : Matrix[Double],
101101
response : Vector[Double],
102102
doCV : Boolean,
103103
k : Int,
@@ -110,8 +110,11 @@ object localDual {
110110
checkDualityGap : Boolean,
111111
stoppingDualityGap : Double) : (List[Int], Vector[Double]) = {
112112

113+
// cast to dense matrix again
114+
val rawFeatures = matrixWithIndex._2.toDenseMatrix
115+
113116
// create design matrix by concatenating raw and random features
114-
val designMat = DenseMatrix.horzcat(matrixWithIndex._2, randomMats)
117+
val designMat = DenseMatrix.horzcat(rawFeatures, randomMats.toDenseMatrix)
115118

116119
// total number of features in local design matrix
117120
val numFeatures = designMat.cols
@@ -155,7 +158,7 @@ object localDual {
155158

156159
// map dual to primal variables and scale correctly
157160
val primalVariables : DenseMatrix[Double] =
158-
matrixWithIndex._2.t * new DenseMatrix(nObs, 1, alpha.toArray)
161+
rawFeatures.t * new DenseMatrix(nObs, 1, alpha.toArray)
159162
val scaling = 1.0/(nObs*min_lambda)
160163
val beta_hat = primalVariables.toDenseVector * scaling
161164

@@ -285,8 +288,8 @@ object localDual {
285288

286289

287290
def runLocalDualAdd_lambdaSeq(
288-
matrixWithIndex: (Int, (List[Int], DenseMatrix[Double], DenseMatrix[Double])),
289-
RPsAdded : DenseMatrix[Double],
291+
matrixWithIndex: (Int, (List[Int], Matrix[Double], Matrix[Double])),
292+
RPsAdded : Matrix[Double],
290293
response : Vector[Double],
291294
lambdaSeq : Seq[Double],
292295
nObs : Int,
@@ -312,8 +315,8 @@ object localDual {
312315

313316

314317
def runLocalDualConcatenate_lambdaSeq(
315-
matrixWithIndex: (Int, (List[Int], DenseMatrix[Double])),
316-
RPsMap : collection.Map[Int, DenseMatrix[Double]],
318+
matrixWithIndex: (Int, (List[Int], Matrix[Double])),
319+
RPsMap : collection.Map[Int, Matrix[Double]],
317320
response : Vector[Double],
318321
lambdaSeq : Seq[Double],
319322
nObs : Int,
@@ -339,8 +342,8 @@ object localDual {
339342

340343

341344
def runLocalDual_lambdaSeq(
342-
matrixWithIndex: (List[Int], DenseMatrix[Double]),
343-
randomMats: DenseMatrix[Double],
345+
matrixWithIndex: (List[Int], Matrix[Double]),
346+
randomMats: Matrix[Double],
344347
response : Vector[Double],
345348
lambdaSeq : Seq[Double],
346349
nObs : Int,
@@ -351,8 +354,11 @@ object localDual {
351354
checkDualityGap : Boolean,
352355
stoppingDualityGap : Double) : Seq[(Double, (List[Int], Vector[Double]))] = {
353356

357+
// cast to dense matrix
358+
val rawFeatures = matrixWithIndex._2.toDenseMatrix
359+
354360
// create design matrix by concatenating raw and random features
355-
val designMat = DenseMatrix.horzcat(matrixWithIndex._2, randomMats)
361+
val designMat = DenseMatrix.horzcat(rawFeatures, randomMats)
356362

357363
val numFeatures = designMat.cols
358364

@@ -380,7 +386,7 @@ object localDual {
380386
}
381387

382388
val primalVarsNotScaled : DenseMatrix[Double] =
383-
matrixWithIndex._2.t * new DenseMatrix(nObs, 1, alpha.toArray)
389+
rawFeatures.t * new DenseMatrix(nObs, 1, alpha.toArray)
384390
val scaling = 1.0/(nObs*currentLambda)
385391
val beta_hat = primalVarsNotScaled.toDenseVector * scaling
386392

LOCO/src/main/scala/solvers/localRidge.scala

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ import LOCO.utils.ProjectionUtils._
1010
object localRidge {
1111

1212
def runLocalRidgeRegressionAdd(
13-
matrixWithIndex: (Int, (List[Int], DenseMatrix[Double], DenseMatrix[Double])),
14-
RPsAdded : DenseMatrix[Double],
13+
matrixWithIndex: (Int, (List[Int], Matrix[Double], Matrix[Double])),
14+
RPsAdded : Matrix[Double],
1515
response : Vector[Double],
1616
concatenate : Boolean,
1717
doCV : Boolean,
@@ -34,8 +34,8 @@ object localRidge {
3434

3535

3636
def runLocalRidgeRegressionConcatenate(
37-
matrixWithIndex: (Int, (List[Int], DenseMatrix[Double])),
38-
RPsMap : collection.Map[Int, DenseMatrix[Double]],
37+
matrixWithIndex: (Int, (List[Int], Matrix[Double])),
38+
RPsMap : collection.Map[Int, Matrix[Double]],
3939
response : Vector[Double],
4040
concatenate : Boolean,
4141
doCV : Boolean,
@@ -59,8 +59,8 @@ object localRidge {
5959

6060

6161
def runLocalRidgeRegression(
62-
matrixWithIndex:(List[Int], DenseMatrix[Double]),
63-
randomMats : DenseMatrix[Double],
62+
matrixWithIndex:(List[Int], Matrix[Double]),
63+
randomMats : Matrix[Double],
6464
response : Vector[Double],
6565
concatenate : Boolean,
6666
doCV : Boolean,
@@ -182,8 +182,8 @@ object localRidge {
182182

183183

184184
def runLocalRidgeRegressionAdd_lambdaSeq(
185-
matrixWithIndex: (Int, (List[Int], DenseMatrix[Double], DenseMatrix[Double])),
186-
RPsAdded : DenseMatrix[Double],
185+
matrixWithIndex: (Int, (List[Int], Matrix[Double], Matrix[Double])),
186+
RPsAdded : Matrix[Double],
187187
response : Vector[Double],
188188
concatenate : Boolean,
189189
lambdaSeq : Seq[Double],
@@ -203,8 +203,8 @@ object localRidge {
203203

204204

205205
def runLocalRidgeRegressionConcatenate_lambdaSeq(
206-
matrixWithIndex: (Int, (List[Int], DenseMatrix[Double])),
207-
RPsMap : collection.Map[Int, DenseMatrix[Double]],
206+
matrixWithIndex: (Int, (List[Int], Matrix[Double])),
207+
RPsMap : collection.Map[Int, Matrix[Double]],
208208
response : Vector[Double],
209209
concatenate : Boolean,
210210
lambdaSeq : Seq[Double],
@@ -225,8 +225,8 @@ object localRidge {
225225

226226

227227
def runLocalRidgeRegression_lambdaSeq(
228-
matrixWithIndex: (List[Int], DenseMatrix[Double]),
229-
randomMats : DenseMatrix[Double],
228+
matrixWithIndex: (List[Int], Matrix[Double]),
229+
randomMats : Matrix[Double],
230230
response : Vector[Double],
231231
concatenate : Boolean,
232232
lambdaSeq : Seq[Double],
@@ -237,8 +237,11 @@ object localRidge {
237237
// get number of raw features
238238
val size_raw = matrixWithIndex._1.length
239239

240+
// cast to dense matrix
241+
val rawFeatures = matrixWithIndex._2.toDenseMatrix
242+
240243
// create design matrix by concatenating raw and random features
241-
val designMat = DenseMatrix.horzcat(matrixWithIndex._2, randomMats)
244+
val designMat = DenseMatrix.horzcat(rawFeatures, randomMats)
242245

243246

244247
// create training set

LOCO/src/main/scala/solvers/runLOCO.scala

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
package LOCO.solvers
22

3-
import breeze.linalg.Vector
3+
import breeze.linalg.{DenseMatrix, CSCMatrix, Vector, Matrix}
44
import scala.collection._
55

66
import org.apache.spark.storage.StorageLevel
@@ -12,7 +12,6 @@ import preprocessingUtils.DataPoint
1212
import LOCO.utils.preprocessing
1313
import LOCO.utils.ProjectionUtils._
1414

15-
1615
object runLOCO {
1716

1817
/**
@@ -70,6 +69,7 @@ object runLOCO {
7069
nExecutors : Int,
7170
projection : String,
7271
flagFFTW : Int,
72+
useSparseStructure : Boolean,
7373
concatenate : Boolean,
7474
nFeatsProj : Int,
7575
lambda : Double,
@@ -102,9 +102,9 @@ object runLOCO {
102102
val t1 = System.currentTimeMillis
103103

104104
// project local matrices
105-
val rawAndRandomFeats =
106-
project(parsedDataByCol, projection, flagFFTW, concatenate, nFeatsProj, nObs, nFeats,
107-
myseed, nPartitions)
105+
val rawAndRandomFeats =
106+
project(parsedDataByCol, projection, flagFFTW, useSparseStructure,
107+
concatenate, nFeatsProj, nObs, nFeats, myseed, nPartitions)
108108

109109
// force evaluation of rawAndRandomFeats RDD and unpersist parsedDataByCol
110110
// (only needed for timing purposes)

LOCO/src/main/scala/utils/CVUtils.scala

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ object CVUtils {
6565
nExecutors : Int,
6666
projection : String,
6767
flagFFTW : Int,
68+
useSparseStructure : Boolean,
6869
concatenate : Boolean,
6970
nFeatsProj : Int,
7071
lambdaSeq : Seq[Double],
@@ -93,7 +94,8 @@ object CVUtils {
9394
val (lambdasAndCoefficientVectorsMap, colMeans, meanResponse) =
9495
runForLambdaSequence(
9596
sc, classification, seed, training, center, centerFeaturesOnly, nPartitions,
96-
nExecutors, projection, flagFFTW, concatenate, nFeatsProj, lambdaSeq, optimizer,
97+
nExecutors, projection, flagFFTW, useSparseStructure,
98+
concatenate, nFeatsProj, lambdaSeq, optimizer,
9799
numIterations, checkDualityGap, stoppingDualityGap)
98100

99101
// broadcast column means if features should be centered
@@ -196,6 +198,7 @@ object CVUtils {
196198
nExecutors : Int,
197199
projection : String,
198200
flagFFTW : Int,
201+
useSparseStructure : Boolean,
199202
concatenate : Boolean,
200203
nFeatsProj : Int,
201204
lambdaSeq : Seq[Double],
@@ -223,7 +226,9 @@ object CVUtils {
223226

224227
// project local matrices
225228
val rawAndRandomFeats =
226-
project(parsedDataByCol, projection, flagFFTW, concatenate, nFeatsProj, nObs, nFeats,
229+
project(
230+
parsedDataByCol, projection, flagFFTW, useSparseStructure,
231+
concatenate, nFeatsProj, nObs, nFeats,
227232
myseed, nPartitions)
228233

229234
rawAndRandomFeats.persist(StorageLevel.MEMORY_AND_DISK)

0 commit comments

Comments
 (0)
0