The following is the example:
Schema
|-- age: integer (nullable = true)
|-- workclass: string (nullable = true)
|-- fnlwgt: double (nullable = true)
|-- education: string (nullable = true)
|-- education-num: double (nullable = true)
|-- marital-status: string (nullable = true)
|-- occupation: string (nullable = true)
|-- relationship: string (nullable = true)
|-- race: string (nullable = true)
|-- sex: string (nullable = true)
|-- capital-gain: double (nullable = true)
|-- capital-loss: double (nullable = true)
|-- hours-per-week: double (nullable = true)
|-- native-country: string (nullable = true)
|-- income: string (nullable = true)
// Categorical columns, step 1: give every string column a StringIndexer that
// writes its category index to a separate "<name>Index" column.
// A small factory keeps the nine indexers configured identically.
def indexerFor(inputCol: String, outputCol: String): StringIndexer =
  new StringIndexer().setInputCol(inputCol).setOutputCol(outputCol)

val workclassIndexer = indexerFor("workclass", "workclassIndex")
val educationIndexer = indexerFor("education", "educationIndex")
val maritalStatusIndexer = indexerFor("marital-status", "maritalStatusIndex")
val occupationIndexer = indexerFor("occupation", "occupationIndex")
val relationshipIndexer = indexerFor("relationship", "relationshipIndex")
val raceIndexer = indexerFor("race", "raceIndex")
val sexIndexer = indexerFor("sex", "sexIndex")
val nativeCountryIndexer = indexerFor("native-country", "nativeCountryIndex")
// The target column "income" is indexed as well.
val incomeIndexer = indexerFor("income", "incomeIndex")
// Categorical columns, step 2: one-hot encode each "<name>Index" column into a
// "<name>Vec" vector column.
def oneHotFor(indexCol: String, vectorCol: String): OneHotEncoder =
  new OneHotEncoder().setInputCol(indexCol).setOutputCol(vectorCol)

val workclassEncoder = oneHotFor("workclassIndex", "workclassVec")
val educationEncoder = oneHotFor("educationIndex", "educationVec")
val maritalStatusEncoder = oneHotFor("maritalStatusIndex", "maritalVec")
val occupationEncoder = oneHotFor("occupationIndex", "occupationVec")
val relationshipEncoder = oneHotFor("relationshipIndex", "relationshipVec")
val raceEncoder = oneHotFor("raceIndex", "raceVec")
val sexEncoder = oneHotFor("sexIndex", "sexVec")
val nativeCountryEncoder = oneHotFor("nativeCountryIndex", "nativeCountryVec")
// Despite its name, this stage is a StringIndexer rather than a OneHotEncoder:
// it reads "incomeIndex" and writes the "label" column.
// NOTE(review): this looks redundant with incomeIndexer, which already indexes
// "income" — confirm whether a single indexer writing "label" would suffice.
val incomeEncoder = new StringIndexer().setInputCol("incomeIndex").setOutputCol("label")
// Assemble every predictor into a single "features" vector column; together with
// the separately produced "label" column this gives the ("label", "features") format.
// Inputs follow the schema order. "age" is included because it is a numeric
// predictor in the schema and would otherwise be dropped from the model.
// The outer parentheses let the chained calls span several lines.
val assembler = (new VectorAssembler()
  .setInputCols(Array(
    "age",
    "workclassVec",
    "fnlwgt",
    "educationVec",
    "education-num",
    "maritalVec",
    "occupationVec",
    "relationshipVec",
    "raceVec",
    "sexVec",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "nativeCountryVec"
  ))
  .setOutputCol("features"))
///////////////////////////////
// Set Up the Pipeline ///////
/////////////////////////////
import org.apache.spark.ml.Pipeline
// Classifier used as the final pipeline stage, with no parameters set explicitly.
val lr = new LogisticRegression()

// Stage order matters: each group consumes columns produced by the group before it.
val pipeline = new Pipeline().setStages(Array(
  // 1. string column -> index column
  workclassIndexer, educationIndexer, maritalStatusIndexer, occupationIndexer,
  relationshipIndexer, raceIndexer, sexIndexer, nativeCountryIndexer, incomeIndexer,
  // 2. index column -> one-hot vector (incomeEncoder writes "label")
  workclassEncoder, educationEncoder, maritalStatusEncoder, occupationEncoder,
  relationshipEncoder, raceEncoder, sexEncoder, nativeCountryEncoder, incomeEncoder,
  // 3. build "features", then train the classifier
  assembler, lr
))

// Fit all pipeline stages on the training data.
val model = pipeline.fit(training)