0

I have a custom training job that I run on a fixed schedule using Cloud Scheduler. When I create the job using either the Python client or gcp, the job runs fine. However, when I create the cloud scheduler job using the Java SDK, the job gets created but it fails. The SUMMARY of the error message I get in Cloud Logging is:

{"@type":"type.googleapis.com/google.cloud.scheduler.logging.AttemptFinished", "jobName":"projects/{my_project_id}/locations/us-central1/jobs/java_job", "status":"INVALID_ARGUMENT", "targetType":"HTTP", "url":"https://us-central1-aiplatform.googleapis.com/v1/projects/{my_project_id}/locations/us-central1/customJobs"}

I looked at the jobs created in gcp, all fields for the three jobs (the one created using python client, the one created using java SDK and the one created directly in gcp) are the same. I cannot figure out why the job created using the Java SDK keeps failing.

Java SDK code:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import com.google.cloud.scheduler.v1.Job;
import com.google.cloud.scheduler.v1.LocationName;
import com.google.cloud.scheduler.v1.OAuthToken;
import com.google.protobuf.ByteString;
import com.google.cloud.scheduler.v1.CloudSchedulerClient;
import com.google.cloud.scheduler.v1.HttpMethod;
import com.google.cloud.scheduler.v1.HttpTarget;


public class Temp 
{
    
    static String projectId = "...";
    static String location = "...";
    static String serviceAccountEmail = "...-compute@developer.gserviceaccount.com";
    static String outputUriPrefix = "gs://.../.../";
    static String imageUri = String.format("%s-docker.pkg.dev/%s/.../...", location, projectId);
    
    static String trainingJobName = "custom_training_job";
    static String schedulerJobName = String.format("projects/%s/locations/%s/jobs/java_job", projectId, location);
    static String scope = "https://www.googleapis.com/auth/cloud-platform";
    static String httpTargetUri = String.format("https://%s-aiplatform.googleapis.com/v1/projects/%s/locations/%s/customJobs", 
            location, projectId, location);
    static String machineType = "n1-standard-4";
    static long replicaCount = 1;
    
    
    static String getJobBody() throws JSONException {
        JSONObject jobBody = new JSONObject();
        jobBody.put("display_name", trainingJobName);
        JSONObject base_output_directory = new JSONObject();
        base_output_directory.put("output_uri_prefix", outputUriPrefix);
        jobBody.put("base_output_directory", base_output_directory);
        JSONObject jobSpec = new JSONObject();
        JSONArray worker_pool_specs = new JSONArray();
        JSONObject spec = new JSONObject();
        spec.put("replica_count", replicaCount);
        JSONObject machine_spec = new JSONObject();
        machine_spec.put("machine_type", machineType);
        spec.put("machine_spec", machine_spec);
        JSONObject container_spec = new JSONObject();
        container_spec.put( "image_uri", imageUri);
        JSONArray args = new JSONArray();
        args.put("--msg=hello!");
        container_spec.put( "args", args);
        spec.put("container_spec", container_spec);
        worker_pool_specs.put(spec);
        jobSpec.put("worker_pool_specs", worker_pool_specs);
        jobBody.put("job_spec", jobSpec);
        return jobBody.toString();
    }
    
    public static void main( String[] args ) throws IOException, JSONException
    {
        System.out.println(String.format("=======STARTING APPLICATION, version %s =======", "v5"));
        
        CloudSchedulerClient client = CloudSchedulerClient.create();
        
        String parent = LocationName.of(projectId, location).toString();
        
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("User-Agent", "Google-Cloud-Scheduler");
        headers.put("Content-Type", "application/json; charset=utf-8");
        
        OAuthToken token = OAuthToken.newBuilder()
                .setServiceAccountEmail(serviceAccountEmail)
                .setScope(scope)
                .build();       
                
        HttpTarget httpTarget = HttpTarget.newBuilder()
                .setUri(httpTargetUri)
                .setHttpMethod(HttpMethod.POST)
                .putAllHeaders(headers)
                .setBody(ByteString.copyFromUtf8(getJobBody()))
                .setOauthToken(token)
                .build();   
        
        Job job = Job.newBuilder()
                .setName(schedulerJobName)
                .setDescription("test java job")
                .setSchedule("* * * * *")
                .setTimeZone("Africa/Abidjan")
                .setHttpTarget(httpTarget)
                .build();
        
        client.createJob(parent, job);
        client.close();
    }
}

Python Client code:

from google.cloud import scheduler
import json


project_id = "..."
location = "..."
service_account_email = "...-compute@developer.gserviceaccount.com"
output_uri_prefix="gs://.../.../"
image_uri=f'{location}-docker.pkg.dev/{project_id}/.../...'

traning_job__name ="custom_training_job"
scheduler_job_name = f'projects/{project_id}/locations/{location}/jobs/python_job'
scope = "https://www.googleapis.com/auth/cloud-platform"
http_target_uri = f'https://{location}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{location}/customJobs'
machine_type = "n1-standard-4"
replica_count = 1


job_spec = {
    "display_name": traning_job__name,
    "job_spec": {
            "worker_pool_specs": [
                {
                    "machine_spec": {
                        "machine_type": machine_type,
                    },
                    "replica_count": replica_count,
                    "container_spec": {
                        "image_uri": image_uri,
                        "args": [
                            "--msg=hello!"
                        ]
                    }
                }
            ],
        "base_output_directory": {
            "output_uri_prefix": output_uri_prefix
        }
    }
}


job = {
  "name": scheduler_job_name,
  "description": "Created from Python client",
  "http_target": {
    "uri": http_target_uri,
    "http_method": "POST",
    "headers": {
      "User-Agent": "Google-Cloud-Scheduler",
      "Content-Type": "application/json; charset=utf-8"
    },
    "body": json.dumps(job_spec).encode('utf-8'),
    "oauth_token": {
      "service_account_email": service_account_email,
      "scope": scope
    }
  },
  "schedule": "* * * * *",
  "time_zone": "Africa/Abidjan"
}


client = scheduler.CloudSchedulerClient()
parent = f'projects/{project_id}/locations/{location}' 
response = client.create_job(parent = parent, job = job)

EDIT

The problem was that in the getJobBody function, I was setting base_output_directory as a top level field, whereas it should be a nested field inside the job_spec. The problem is solved but is there a better way to do this? I know there is a CustomJobSpec class, but could not find a way to convert it into a Json style string.

racerX
  • 930
  • 9
  • 25
  • {my_project_id} is a sub for the actual projectId – racerX Sep 08 '21 at 04:39
  • Could you share the actual code that creates the jobs in Python client or Cloud SDK and also the one using the Java SDK so we can compare? – Ralemos Sep 08 '21 at 11:57
  • Added the Python and Java codes in the question – racerX Sep 08 '21 at 16:19
  • Sorry for the late reply, I just saw your edit to the question saying that the issue is fixed. I believe that is the way to go indeed. Could you add that into an answer and accept it? That way, others in the community that go through the same issue will easily find your solution. – Ralemos Sep 13 '21 at 14:43
  • Hi @RafaelLemos, added the answer, had not added it till now because it just seemed like a typo kind of mistake on my part so wasn't sure if that would be of any help. Thought if I left it unanswered someone might point out a better way of doing it. – racerX Sep 13 '21 at 19:24

1 Answers1

1

As mentioned in the edit, the problem was that in the getJobBody function, the base_output_directory was being set as a top level field, whereas it should be a nested field inside the job_spec. So currently, as far as I know, the way to avoid this mistake is to set the jobBody carefully, I don't know of a way to do this in a more structured manner.

racerX
  • 930
  • 9
  • 25