0

Is there a way to import a CSV file into Cassandra through Spark's Java API without creating a POJO class for the CSV? I am able to insert the CSV by creating a POJO class as shown below, but I would like to do this programmatically, without writing a POJO class for each CSV.

My csv looks like this
Name,Age,bg,sex
ammar,67,ab+,M
nehan,88,b+,M
moin,99,m+,M
arbaaz,67,a+,M
...

And the program is below.

import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import com.cassandra.insertion.MergeGeneSymDataInsertion;
import com.cassandra.insertion.MergeGeneSymDataInsertion.HgIpsenGeneSym;
import com.publicdata.task.PublicDataInsertion.PublicData;
import static com.datastax.spark.connector.japi.CassandraJavaUtil.*;
public class InsertCsv {

static JavaSparkContext ctx = null;
// NOTE(review): this mutable static flag was the original header-skip mechanism.
// It is unreliable in Spark: the map function executes once per partition (and per
// executor JVM on a cluster), so every partition would skip its own first record
// instead of the single CSV header. The field is retained only for source
// compatibility; the header is now detected by comparing line content instead.
static boolean isHeader = true;

public static void main(String[] args) {

    try {
        ctx = new JavaSparkContext(new SparkConf().setMaster("local[4]")
                .setAppName("TestCsvInserion"));
        insertCsv(ctx);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Always release Spark resources, even when the job fails.
        if (ctx != null) {
            ctx.stop();
        }
    }

}

/**
 * Reads the CSV file, parses each data row into a {@link Bats} bean and writes
 * the result to the Cassandra table {@code test.bats}.
 *
 * @param ctx active Spark context used to load the file and run the job
 */
private static void insertCsv(JavaSparkContext ctx) {

    JavaRDD<String> testfileRdd = ctx
            .textFile("/home/syedammar/Pilot Project /test.csv");

    // Capture the header once on the driver. Filtering by line content is correct
    // on every partition, unlike the old static-flag approach, which only skipped
    // the first line seen by each executor JVM.
    final String header = testfileRdd.first();

    JavaRDD<Bats> batsclassRdd = testfileRdd
            .filter(new Function<String, Boolean>() {

                @Override
                public Boolean call(String line) throws Exception {
                    // Drop the header row and blank lines before parsing.
                    return !line.equals(header) && !line.trim().isEmpty();
                }
            })
            .map(new Function<String, Bats>() {

                @Override
                public Bats call(String line) throws Exception {
                    String[] words = StringUtils.split(line, ",");
                    if (words == null || words.length < 4) {
                        // Malformed row: previously this threw
                        // ArrayIndexOutOfBoundsException and failed the whole job.
                        return null;
                    }
                    return new Bats(words[0], words[1], words[2], words[3]);
                }
            })
            .filter(new Function<Bats, Boolean>() {

                @Override
                public Boolean call(Bats obj) throws Exception {
                    // Discard rows the parser rejected.
                    return obj != null;
                }
            }).coalesce(1);

    javaFunctions(batsclassRdd).writerBuilder("test", "bats", mapToRow(Bats.class)).saveToCassandra();

}

/**
 * JavaBean mapped onto the Cassandra table {@code test.bats} by
 * {@code mapToRow}. Field names must match the table's column names.
 */
public static class Bats {

    private String name;
    private String age;
    private String bg;
    private String sex;

    /** No-arg constructor required by the Spark-Cassandra row mapper. */
    public Bats() {
    }

    public Bats(String name, String age, String bg, String sex) {
        super();
        this.name = name;
        this.age = age;
        this.bg = bg;
        this.sex = sex;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getAge() {
        return age;
    }

    public void setAge(String age) {
        this.age = age;
    }

    public String getBg() {
        return bg;
    }

    public void setBg(String bg) {
        this.bg = bg;
    }

    public String getSex() {
        return sex;
    }

    public void setSex(String sex) {
        this.sex = sex;
    }

}

}

Syed Ammar Mustafa
  • 373
  • 1
  • 7
  • 18
  • I think what you want is spark-csv https://github.com/databricks/spark-csv – phact Sep 18 '15 at 14:12
  • The link you sent does not make it clear to me. Could you give me an example (code) for the CSV template in the question above, inserting it into a Cassandra table with the same CSV headers, without creating a POJO class, using the Spark Java API? Thanks – Syed Ammar Mustafa Sep 22 '15 at 09:51

1 Answers1

0

Yes, you can do that. I found it while browsing — please refer to How to Parsing CSV or JSON File with Apache Spark. There are two approaches; follow the procedure for approach B.

POJO classes are not required for approach B, but they would make your code easier to read if you are using Java.

Hope this will help.

Community
  • 1
  • 1
Shobha
  • 1