1

I'm trying to create a pipeline using the Apache Beam Java SDK and Google Cloud Dataflow to move data from Cloud SQL to Elasticsearch.

I've written the following main method:

 /**
  * Builds and runs a Dataflow batch pipeline that reads rows from a Cloud SQL (MySQL) table
  * through JdbcIO and indexes each row into Elasticsearch as one JSON document.
  *
  * <p>Pipeline options (project, temp locations, subnetwork, service account, job name) are
  * hard-coded; {@code args} is not consulted. The call blocks until the Dataflow job finishes.
  *
  * @param args command-line arguments (unused)
  * @throws Exception if the service-account key cannot be loaded or the pipeline fails
  */
 public static void main(String[] args) throws Exception {
    DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
    options.setProject("staging");
    options.setTempLocation("gs://csv_to_sql_staging/temp");
    options.setRunner(DataflowRunner.class);
    options.setGcpTempLocation("gs://csv_to_sql_staging/temp");
    // NOTE(review): with public IPs disabled the workers have no external address. The
    // "Connection timed out" raised from the Cloud SQL socket factory is presumably the worker
    // trying to reach the instance's public IP. Likely fix: enable private IP on the instance
    // for this VPC and append "&ipTypes=PRIVATE" to the JDBC URL below (or allow public IPs).
    // Confirm against the instance configuration before changing the URL.
    options.setUsePublicIps(false);
    options.setJobName("tamer-new");
    // NOTE(review): the subnetwork is in us-central1 while the instance name says europe-west1.
    options.setSubnetwork("regions/us-central1/subnetworks/new-network");

    final List<String> scopes = Arrays.asList(
        "https://www.googleapis.com/auth/cloud-platform",
        "https://www.googleapis.com/auth/devstorage.full_control",
        "https://www.googleapis.com/auth/userinfo.email",
        "https://www.googleapis.com/auth/datastore",
        "https://www.googleapis.com/auth/sqlservice.admin",
        "https://www.googleapis.com/auth/pubsub");

    // Load the key from the classpath via the class literal (no throw-away instance) and make
    // sure the stream is closed once the credential has been parsed.
    try (java.io.InputStream keyStream =
            ElasticSearchIO.class.getResourceAsStream("/staging-b648da5d2b9b.json")) {
        if (keyStream == null) {
            throw new IllegalStateException(
                "Service account key /staging-b648da5d2b9b.json not found on the classpath");
        }
        options.setGcpCredential(
            ServiceAccountCredentials.fromStream(keyStream).createScoped(scopes));
    }
    options.setServiceAccount("data-flow@staging.iam.gserviceaccount.com");

    Pipeline p = Pipeline.create(options);

    // Each row is mapped straight to a JSON document because ElasticsearchIO.Write consumes a
    // PCollection<String> of JSON documents.
    PCollection<String> rows = p.apply(
        JdbcIO.<String>read()
            .withDataSourceConfiguration(
                JdbcIO.DataSourceConfiguration.create(
                    // Connector/J 8.x driver class; "com.mysql.jdbc.Driver" is the deprecated
                    // 5.x name.
                    "com.mysql.cj.jdbc.Driver",
                    "jdbc:mysql://google/nameDB_new?cloudSqlInstance=staging:europe-west1:sql-staging-instance&socketFactory=com.google.cloud.sql.mysql.SocketFactory&useUnicode=true&characterEncoding=UTF-8&user=user&password=password&useSSL=false"))
            // The alias "u" has to be declared for "u.id" / "u.name" to resolve.
            .withQuery("select u.id, u.name from user_table u")
            .withRowMapper(new RowMapper<String>() {
                @Override
                public String mapRow(ResultSet resultSet) throws Exception {
                    return toJsonDocument(resultSet);
                }
            })
            .withCoder(StringUtf8Coder.of()));

    ElasticsearchIO.Write w = ElasticsearchIO.write().withConnectionConfiguration(
        ElasticsearchIO.ConnectionConfiguration
            .create(new String[] {"https://host:9243"}, "user-temp", "String")
            .withUsername("elastic")
            .withPassword("password"));

    // Apply the write transform itself. The previous w.compose(identity) resolved to the static
    // PTransform.compose and produced an identity transform, so nothing was ever indexed.
    rows.apply(w);

    p.run().waitUntilFinish();
 }

 /**
  * Renders the current row of {@code resultSet} as a flat JSON object keyed by column label.
  * Non-null values are written as JSON strings (via {@code String.valueOf}); SQL NULL becomes
  * JSON {@code null}.
  */
 private static String toJsonDocument(ResultSet resultSet) throws java.sql.SQLException {
    java.sql.ResultSetMetaData meta = resultSet.getMetaData();
    int columnCount = meta.getColumnCount();
    StringBuilder json = new StringBuilder("{");
    for (int i = 1; i <= columnCount; i++) {
        if (i > 1) {
            json.append(',');
        }
        appendJsonString(json, meta.getColumnLabel(i));
        json.append(':');
        Object value = resultSet.getObject(i);
        if (value == null) {
            json.append("null");
        } else {
            appendJsonString(json, String.valueOf(value));
        }
    }
    return json.append('}').toString();
 }

 /** Appends {@code value} to {@code out} as a quoted, escaped JSON string literal. */
 private static void appendJsonString(StringBuilder out, String value) {
    out.append('"');
    for (int i = 0; i < value.length(); i++) {
        char c = value.charAt(i);
        switch (c) {
            case '"':
                out.append("\\\"");
                break;
            case '\\':
                out.append("\\\\");
                break;
            case '\n':
                out.append("\\n");
                break;
            case '\r':
                out.append("\\r");
                break;
            case '\t':
                out.append("\\t");
                break;
            default:
                if (c < 0x20) {
                    // Remaining control characters must be written as \\uXXXX escapes in JSON.
                    out.append(String.format("\\u%04x", (int) c));
                } else {
                    out.append(c);
                }
        }
    }
    out.append('"');
 }

Below is the pom.xml file:

<?xml version="1.0" encoding="UTF-8"?>

<!-- Maven build for the Dataflow pipeline: Beam SDK + Dataflow runner, JDBC and Elasticsearch IO,
     the MySQL driver with the Cloud SQL socket factory, and slf4j logging. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.harmonica.dataflow</groupId>
  <artifactId>com-harmonica-dataflow</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <!-- Versions shared by the plugin and dependency declarations below -->
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven-compiler-plugin.version>3.7.0</maven-compiler-plugin.version>
    <exec-maven-plugin.version>1.6.0</exec-maven-plugin.version>
    <slf4j.version>1.7.25</slf4j.version>
    <beam.version>2.19.0</beam.version>
  </properties>

  <!-- Snapshot-only repository (releases disabled) -->
  <repositories>
    <repository>
      <id>ossrh.snapshots</id>
      <name>Sonatype OSS Repository Hosting</name>
      <url>https://oss.sonatype.org/content/repositories/snapshots/</url>
      <releases>
        <enabled>false</enabled>
      </releases>
      <snapshots>
        <enabled>true</enabled>
      </snapshots>
    </repository>
  </repositories>

  <build>
    <plugins>
      <!-- Compile for Java 8 -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>${maven-compiler-plugin.version}</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
    </plugins>

    <pluginManagement>
      <plugins>
        <!-- Used by "mvn compile exec:java" to launch the pipeline main class -->
        <plugin>
          <groupId>org.codehaus.mojo</groupId>
          <artifactId>exec-maven-plugin</artifactId>
          <version>${exec-maven-plugin.version}</version>
          <configuration>
            <cleanupDaemonThreads>false</cleanupDaemonThreads>
          </configuration>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>

  <dependencies>
    <!-- Apache Beam: core SDK, Dataflow runner and the IO connectors -->
    <dependency>
      <groupId>org.apache.beam</groupId>
      <artifactId>beam-sdks-java-core</artifactId>
      <version>${beam.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.beam</groupId>
      <artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
      <version>${beam.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.beam</groupId>
      <artifactId>beam-sdks-java-io-elasticsearch</artifactId>
      <version>${beam.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.beam</groupId>
      <artifactId>beam-sdks-java-io-jdbc</artifactId>
      <version>${beam.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.beam</groupId>
      <artifactId>beam-sdks-java-io-google-cloud-platform</artifactId>
      <version>${beam.version}</version>
    </dependency>

    <!-- MySQL JDBC driver and the matching Cloud SQL socket factory (Connector/J 8) -->
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>8.0.19</version>
    </dependency>

    <dependency>
      <groupId>com.google.cloud.sql</groupId>
      <artifactId>mysql-socket-factory-connector-j-8</artifactId>
      <version>1.0.15</version>
    </dependency>

    <!-- slf4j API front end bound to the java.util.logging back end -->
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>${slf4j.version}</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-jdk14</artifactId>
      <version>${slf4j.version}</version>
    </dependency>
  </dependencies>
</project>

When I execute this command:

mvn compile exec:java -Dexec.mainClass=com.dataflow.ElasticSearchIO

The worker starts successfully but then can't connect to Cloud SQL, even though I've done the following:

  • I've created a service account with owner access to the project and passed it to the runner options
  • I've created a VPC network named new-network with the IP range 190.10.0.0/16, assigned it in the pipeline options, and then whitelisted this range in Cloud SQL

However, I'm still getting this error:

Error message from worker: java.lang.RuntimeException: org.apache.beam.sdk.util.UserCodeException: java.sql.SQLException: Cannot create PoolableConnectionFactory (Communications link failure The last packet sent successfully to the server was 0 milliseconds ago. The driver has not received any packets from the server.) org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory$1.typedApply(IntrinsicMapTaskExecutorFactory.java:194) org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory$1.typedApply(IntrinsicMapTaskExecutorFactory.java:165) org.apache.beam.runners.dataflow.worker.graph.Networks$TypeSafeNodeFunction.apply(Networks.java:63) org.apache.beam.runners.dataflow.worker.graph.Networks$TypeSafeNodeFunction.apply(Networks.java:50) org.apache.beam.runners.dataflow.worker.graph.Networks.replaceDirectedNetworkNodes(Networks.java:87) org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory.create(IntrinsicMapTaskExecutorFactory.java:125) org.apache.beam.runners.dataflow.worker.BatchDataflowWorker.doWork(BatchDataflowWorker.java:352) org.apache.beam.runners.dataflow.worker.BatchDataflowWorker.getAndPerformWork(BatchDataflowWorker.java:305) org.apache.beam.runners.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.doWork(DataflowBatchWorkerHarness.java:140) org.apache.beam.runners.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.call(DataflowBatchWorkerHarness.java:120) org.apache.beam.runners.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.call(DataflowBatchWorkerHarness.java:107) java.util.concurrent.FutureTask.run(FutureTask.java:266) java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) java.lang.Thread.run(Thread.java:748) Caused by: org.apache.beam.sdk.util.UserCodeException: java.sql.SQLException: Cannot create PoolableConnectionFactory (Communications link failure The last packet sent 
successfully to the server was 0 milliseconds ago. The driver has not received any packets from the server.) org.apache.beam.sdk.util.UserCodeException.wrap(UserCodeException.java:34) org.apache.beam.sdk.io.jdbc.JdbcIO$ReadFn$DoFnInvoker.invokeSetup(Unknown Source) org.apache.beam.runners.dataflow.worker.DoFnInstanceManagers$ConcurrentQueueInstanceManager.deserializeCopy(DoFnInstanceManagers.java:80) org.apache.beam.runners.dataflow.worker.DoFnInstanceManagers$ConcurrentQueueInstanceManager.peek(DoFnInstanceManagers.java:62) org.apache.beam.runners.dataflow.worker.UserParDoFnFactory.create(UserParDoFnFactory.java:95) org.apache.beam.runners.dataflow.worker.DefaultParDoFnFactory.create(DefaultParDoFnFactory.java:75) org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory.createParDoOperation(IntrinsicMapTaskExecutorFactory.java:264) org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory.access$000(IntrinsicMapTaskExecutorFactory.java:86) org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory$1.typedApply(IntrinsicMapTaskExecutorFactory.java:183) ... 14 more Caused by: java.sql.SQLException: Cannot create PoolableConnectionFactory (Communications link failure The last packet sent successfully to the server was 0 milliseconds ago. The driver has not received any packets from the server.) org.apache.commons.dbcp2.BasicDataSource.createPoolableConnectionFactory(BasicDataSource.java:735) org.apache.commons.dbcp2.BasicDataSource.createDataSource(BasicDataSource.java:605) org.apache.commons.dbcp2.BasicDataSource.getConnection(BasicDataSource.java:809) org.apache.beam.sdk.io.jdbc.JdbcIO$ReadFn.setup(JdbcIO.java:881) Caused by: com.mysql.cj.jdbc.exceptions.CommunicationsException: Communications link failure The last packet sent successfully to the server was 0 milliseconds ago. The driver has not received any packets from the server. 
com.mysql.cj.jdbc.exceptions.SQLError.createCommunicationsException(SQLError.java:174) com.mysql.cj.jdbc.exceptions.SQLExceptionsMapping.translateException(SQLExceptionsMapping.java:64) com.mysql.cj.jdbc.ConnectionImpl.createNewIO(ConnectionImpl.java:836) com.mysql.cj.jdbc.ConnectionImpl.(ConnectionImpl.java:456) com.mysql.cj.jdbc.ConnectionImpl.getInstance(ConnectionImpl.java:246) com.mysql.cj.jdbc.NonRegisteringDriver.connect(NonRegisteringDriver.java:197) org.apache.commons.dbcp2.DriverConnectionFactory.createConnection(DriverConnectionFactory.java:53) org.apache.commons.dbcp2.PoolableConnectionFactory.makeObject(PoolableConnectionFactory.java:355) org.apache.commons.dbcp2.BasicDataSource.validateConnectionFactory(BasicDataSource.java:116) org.apache.commons.dbcp2.BasicDataSource.createPoolableConnectionFactory(BasicDataSource.java:731) org.apache.commons.dbcp2.BasicDataSource.createDataSource(BasicDataSource.java:605) org.apache.commons.dbcp2.BasicDataSource.getConnection(BasicDataSource.java:809) org.apache.beam.sdk.io.jdbc.JdbcIO$ReadFn.setup(JdbcIO.java:881) org.apache.beam.sdk.io.jdbc.JdbcIO$ReadFn$DoFnInvoker.invokeSetup(Unknown Source) org.apache.beam.runners.dataflow.worker.DoFnInstanceManagers$ConcurrentQueueInstanceManager.deserializeCopy(DoFnInstanceManagers.java:80) org.apache.beam.runners.dataflow.worker.DoFnInstanceManagers$ConcurrentQueueInstanceManager.peek(DoFnInstanceManagers.java:62) org.apache.beam.runners.dataflow.worker.UserParDoFnFactory.create(UserParDoFnFactory.java:95) org.apache.beam.runners.dataflow.worker.DefaultParDoFnFactory.create(DefaultParDoFnFactory.java:75) org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory.createParDoOperation(IntrinsicMapTaskExecutorFactory.java:264) org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory.access$000(IntrinsicMapTaskExecutorFactory.java:86) 
org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory$1.typedApply(IntrinsicMapTaskExecutorFactory.java:183) org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory$1.typedApply(IntrinsicMapTaskExecutorFactory.java:165) org.apache.beam.runners.dataflow.worker.graph.Networks$TypeSafeNodeFunction.apply(Networks.java:63) org.apache.beam.runners.dataflow.worker.graph.Networks$TypeSafeNodeFunction.apply(Networks.java:50) org.apache.beam.runners.dataflow.worker.graph.Networks.replaceDirectedNetworkNodes(Networks.java:87) org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory.create(IntrinsicMapTaskExecutorFactory.java:125) org.apache.beam.runners.dataflow.worker.BatchDataflowWorker.doWork(BatchDataflowWorker.java:352) org.apache.beam.runners.dataflow.worker.BatchDataflowWorker.getAndPerformWork(BatchDataflowWorker.java:305) org.apache.beam.runners.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.doWork(DataflowBatchWorkerHarness.java:140) org.apache.beam.runners.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.call(DataflowBatchWorkerHarness.java:120) org.apache.beam.runners.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.call(DataflowBatchWorkerHarness.java:107) java.util.concurrent.FutureTask.run(FutureTask.java:266) java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) java.lang.Thread.run(Thread.java:748) Caused by: com.mysql.cj.exceptions.CJCommunicationsException: Communications link failure The last packet sent successfully to the server was 0 milliseconds ago. The driver has not received any packets from the server. 
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) java.lang.reflect.Constructor.newInstance(Constructor.java:423) com.mysql.cj.exceptions.ExceptionFactory.createException(ExceptionFactory.java:61) com.mysql.cj.exceptions.ExceptionFactory.createException(ExceptionFactory.java:105) com.mysql.cj.exceptions.ExceptionFactory.createException(ExceptionFactory.java:151) com.mysql.cj.exceptions.ExceptionFactory.createCommunicationsException(ExceptionFactory.java:167) com.mysql.cj.protocol.a.NativeSocketConnection.connect(NativeSocketConnection.java:91) com.mysql.cj.NativeSession.connect(NativeSession.java:144) com.mysql.cj.jdbc.ConnectionImpl.connectOneTryOnly(ConnectionImpl.java:956) com.mysql.cj.jdbc.ConnectionImpl.createNewIO(ConnectionImpl.java:826) ... 32 more Caused by: java.net.ConnectException: Connection timed out (Connection timed out) java.net.PlainSocketImpl.socketConnect(Native Method) java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350) java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206) java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188) java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392) java.net.Socket.connect(Socket.java:589) sun.security.ssl.SSLSocketImpl.connect(SSLSocketImpl.java:673) sun.security.ssl.BaseSSLSocketImpl.connect(BaseSSLSocketImpl.java:173) com.google.cloud.sql.core.CoreSocketFactory.createSslSocket(CoreSocketFactory.java:233) com.google.cloud.sql.core.CoreSocketFactory.connect(CoreSocketFactory.java:185) com.google.cloud.sql.mysql.SocketFactory.connect(SocketFactory.java:48) com.google.cloud.sql.mysql.SocketFactory.connect(SocketFactory.java:38) com.mysql.cj.protocol.a.NativeSocketConnection.connect(NativeSocketConnection.java:65) ... 35 more

Please, any help will be highly appreciated! Thanks in advance.

Soumitri Pattnaik
  • 3,246
  • 4
  • 24
  • 42
Tamer Saleh
  • 473
  • 9
  • 21
  • Have you tried any of the workarounds provided on [this other thread](https://stackoverflow.com/questions/44699643/connecting-to-cloud-sql-from-dataflow-job)? – rsalinas Feb 19 '20 at 15:08

2 Answers

1

You can use the code below to establish the connection:

// Build the pipeline from previously configured options (defined outside this snippet).
Pipeline p = Pipeline.create(options);

    // Pooled JDBC data source for the Cloud SQL instance; raise the pool sizes below for
    // larger record volumes.
    // NOTE(review): ComboPooledDataSource is presumably c3p0's pooled data source; no c3p0
    // artifact appears in the dependency list given with this answer — confirm.

    ComboPooledDataSource dataSource = new ComboPooledDataSource();

    dataSource.setDriverClass("com.mysql.jdbc.Driver");
    dataSource.setJdbcUrl(
            "jdbc:mysql://google/test?cloudSqlInstance=dataflowtest-:us-central1:sql-test&socketFactory=com.google.cloud.sql.mysql.SocketFactory");
    dataSource.setUser("root");
    dataSource.setPassword("root");
    dataSource.setMaxPoolSize(10);
    dataSource.setInitialPoolSize(6);

    // Hand the pooled data source to JdbcIO.
    JdbcIO.DataSourceConfiguration config = JdbcIO.DataSourceConfiguration.create(dataSource);

    // Tip: add rewriteBatchedStatements=true (presumably as a JDBC URL parameter) to improve
    // write speed; this snippet itself only reads.

    // Read every row of test_table and emit its first two columns as a String key/value pair.
    PCollection<KV<String, String>> sqlResult = p.apply(JdbcIO.<KV<String, String>>read()
            .withDataSourceConfiguration(config)
            .withQuery("select * from test_table").withCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))
            .withRowMapper(new JdbcIO.RowMapper<KV<String, String>>() {

                private static final long serialVersionUID = 1L;

                // Columns 1 and 2 of the current row become the key and the value.
                public KV<String, String> mapRow(ResultSet resultSet) throws Exception {
                    return KV.of(resultSet.getString(1), resultSet.getString(2));
                }
            }));

Add the dependencies below to pom.xml:

    <!-- Beam JDBC connector -->
    <dependency>
        <groupId>org.apache.beam</groupId>
        <artifactId>beam-sdks-java-io-jdbc</artifactId>
        <version>2.17.0</version>
    </dependency>

    <!-- MySQL JDBC driver -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.25</version>
    </dependency>

    <!-- Cloud SQL socket factory for the MySQL driver -->
    <dependency>
        <groupId>com.google.cloud.sql</groupId>
        <artifactId>mysql-socket-factory</artifactId>
        <version>1.0.0</version>
    </dependency>

This should work.

miles212
  • 383
  • 3
  • 20
0

If possible, try the code below for the SQL connection:

    // Open the connection from the URL, user name and password looked up in `map`.
    // NOTE(review): connectToCloudSql, map and LiteralConstant are not defined in this snippet;
    // presumably they are the answerer's own helpers — confirm.
    connection = connectToCloudSql(map.get(LiteralConstant.URL.toString()),
            map.get(LiteralConstant.USERNAME.toString()), map.get(LiteralConstant.PASSWORD.toString()));

Then use the code below to get the result from the SQL connection:

// Execute the statement ("query" stands in for the real SQL text) and fetch its result set
// together with the column metadata.
statement = connection.prepareCall("query"); 
statement.execute();
resultSet = statement.getResultSet();
ResultSetMetaData rsmd = resultSet.getMetaData();

// Advance to the first row; an empty result or a result without columns is treated as failure.
// NOTE(review): an empty result is reported as a connection failure here — confirm intended.
int count = rsmd.getColumnCount();
if(!resultSet.next() || count < 1)
    throw new ConnectionFailureException("Failed to connect to Cloud SQL");

// Copy each column of the current row into `row`, keyed by column name.
// NOTE(review): only the first row is read, and statement/resultSet are not closed within
// this snippet.
for (int k = 1; k <= count; k++) {
    row.set(rsmd.getColumnName(k), resultSet.getString(k));
}

Collect the above result into a PCollection. Note: don't forget to enable the Cloud SQL API and the Cloud SQL Admin API.

Maven dependency:

<!-- MySQL JDBC driver (5.x line) -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.25</version>
</dependency>
<!-- Cloud SQL socket factory; the artifact must match the driver's major version -->
<dependency>
    <groupId>com.google.cloud.sql</groupId>
    <artifactId>mysql-socket-factory</artifactId> <!-- mysql-socket-factory-connector-j-6 if using 6.x.x -->
    <version>1.0.0</version>
</dependency>

The code above worked in my case. Let me know whether this solution works for you.

miles212
  • 383
  • 3
  • 20
  • Instead of creating template, run your code in direct runner and check whether sql connection is happening or not? Have you enabled all the cloud sql api in your project? – miles212 Feb 13 '20 at 14:20
  • yes its working at direct runner, and yes all Cloud Sql APIs are enabled – Tamer Saleh Feb 13 '20 at 14:26
  • Did you able to fetch any row from Cloud SQL through DirectRunner? Just check the connection object val. I was talking about CLOUD SQL ADMIN API. The same issue I faced some months back, it's because of this API, Otherwise I don't see anything wrong in your code, but I usually go with getConnection for this type of use-cases. – miles212 Feb 13 '20 at 14:33
  • so any way to trace this? when I put wrong name in the cloud instance I got an error of the instance not found, so it seems that it can find the DB instance, when I see the Cloud sql logs I see nothing – Tamer Saleh Feb 13 '20 at 14:39
  • I believe you have have few tables in CloudSQL, get count(*) from any of the table and log that value in ParDO. – miles212 Feb 13 '20 at 14:42
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/207771/discussion-between-miles212-and-tamer-saleh). – miles212 Feb 13 '20 at 14:43
  • Hey miles212@ - Can you update your snippet to use more recent versions of mysql-connector-java and mysql-socket-factory? the 5.x package is very outdated – kurtisvg Feb 13 '20 at 16:34
  • @kurtisvg I’m using the latest and still having the issue – Tamer Saleh Feb 14 '20 at 10:59
  • @kurtisvg its working fine using DirectRunner but still same issue with the DataFlow runner, there is a problem connecting to the Cloud SQL – Tamer Saleh Feb 14 '20 at 11:12