There is a file called file.txt storing config that I'd like to use in my UDF, as shown in the code below.

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class ParseTraceIdUDF extends GenericUDF {
    private transient ObjectInspector[] argumentOIs;
    // Relative path: resolved against the working directory of the Hive process.
    public static String dataFile = "file.txt";
    public static final String SEP = "-";
    public static HashSet<String> targetTraces = new HashSet<String>();
    public static HashSet<String> targets = new HashSet<String>();

    public void ReadFile() {
        try (FileInputStream fis = new FileInputStream(dataFile);
             InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
             BufferedReader br = new BufferedReader(isr)) {
            String line;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                targetTraces.add(line);

                // Keep every SEP-delimited token except the last one.
                String[] tmp = line.split(SEP);
                targets.addAll(Arrays.asList(tmp).subList(0, tmp.length - 1));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    public ObjectInspector initialize(ObjectInspector[] args)
            throws UDFArgumentException {
        if (args.length > 2) {
            throw new UDFArgumentLengthException("The operator accepts at most 2 arguments.");
        }
        ReadFile();
        argumentOIs = args;
        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    }
    ...
}
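
A quick way to see which directory that relative path actually resolves against (a hypothetical debugging addition, not part of the original UDF) is to log the absolute path before opening the file:

// Hypothetical debugging aid: print where the relative path resolves and
// whether the file exists there, making the working directory visible in logs.
java.io.File f = new java.io.File(dataFile);
System.err.println(dataFile + " resolves to " + f.getAbsolutePath()
        + " (exists: " + f.exists() + ")");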

It runs well when I put the file on the local file system and use it in Hive like this:

ADD JAR /***/***/rd.jar;
ADD FILE /**/**/file.txt;
CREATE TEMPORARY FUNCTION parse_trace_id AS 'com.vivo.hive.udf.ParseTraceIdUDF';
...

But when I want to use the config on HDFS like this, it goes wrong:

ADD JAR /***/***/rd.jar;
ADD FILE hdfs://**/**/file.txt;
CREATE TEMPORARY FUNCTION parse_trace_id AS 'com.vivo.hive.udf.ParseTraceIdUDF';
...

The error log is as follows, and I wonder what the reason for it is. I'd appreciate any help.

java.io.FileNotFoundException: file.txt (No such file or directory)
    at java.io.FileInputStream.open0(Native Method)
    at java.io.FileInputStream.open(FileInputStream.java:195)
    at java.io.FileInputStream.<init>(FileInputStream.java:138)
    at java.io.FileInputStream.<init>(FileInputStream.java:93)
    at com.vivo.hive.udf.ParseTraceIdUDF.ReadFile(ParseTraceIdUDF.java:35)
    at com.vivo.hive.udf.ParseTraceIdUDF.initialize(ParseTraceIdUDF.java:71)
    at org.apache.hadoop.hive.ql.udf.generic.GenericUDF.initializeAndFoldConstants(GenericUDF.java:145)
    at org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc.newInstance(ExprNodeGenericFuncDesc.java:233)
    at org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory$DefaultExprProcessor.getXpathOrFuncExprNodeDesc(TypeCheckProcFactory.java:959)
    at org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory$DefaultExprProcessor.process(TypeCheckProcFactory.java:1176)
    at org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher.dispatch(DefaultRuleDispatcher.java:90)
    at org.apache.hadoop.hive.ql.lib.DefaultGraphWalker.dispatchAndReturn(DefaultGraphWalker.java:94)
    at org.apache.hadoop.hive.ql.lib.DefaultGraphWalker.dispatch(DefaultGraphWalker.java:78)
    at org.apache.hadoop.hive.ql.lib.DefaultGraphWalker.walk(DefaultGraphWalker.java:132)
    at org.apache.hadoop.hive.ql.lib.DefaultGraphWalker.startWalking(DefaultGraphWalker.java:109)
    at org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory.genExprNode(TypeCheckProcFactory.java:193)
    at org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory.genExprNode(TypeCheckProcFactory.java:146)
    at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genAllExprNodeDesc(SemanticAnalyzer.java:10621)
    at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genExprNodeDesc(SemanticAnalyzer.java:10577)
    at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genSelectPlan(SemanticAnalyzer.java:3874)
    at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genSelectPlan(SemanticAnalyzer.java:3653)
    at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPostGroupByBodyPlan(SemanticAnalyzer.java:9029)
    at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genBodyPlan(SemanticAnalyzer.java:8984)
    at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9851)
    at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9744)
    at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genOPTree(SemanticAnalyzer.java:10217)
    at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10228)
    at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10108)
    at org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:223)
    at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:558)
    at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1356)
    at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1473)
    at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1285)
    at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1275)
    at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:226)
    at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:175)
    at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:389)
    at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:324)
    at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:726)
    at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:699)
    at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:634)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
    at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
FAILED: NullPointerException null
  • Have you verified that the file you are looking for can be found? As in `hdfs dfs -ls hdfs://**/**/file.txt`? – RealSkeptic Mar 04 '20 at 12:11
  • @RealSkeptic Yes, I'm sure about that. I think I've found the reason by using the `LIST FILES` command: when I use a local file, Hive finds it in the current dir where I run the SQL, but when I use an HDFS file, it gets downloaded to a tmp dir and the UDF cannot find it in the current dir. I thought we could use an absolute path to locate it, but the tmp dir is random, so that doesn't work out well. – Js_zero Mar 04 '20 at 12:23
  • I've never seen a UDF that relies on a file before. Why can't you read it from HDFS within the UDF, though? – OneCricketeer Mar 04 '20 at 13:09
  • Does this help: https://stackoverflow.com/questions/23751702/confusion-about-distributed-cache-in-hadoop? Maybe all you need is to add `./` to your `file.txt`. – mazaneicha Mar 04 '20 at 17:01
  • @cricket_007 I've tried the way you suggested, but I encountered a permission issue. It seems difficult to read config in a UDF, according to this: https://stackoverflow.com/questions/27402442/read-an-hdfs-file-from-a-hive-udf-execution-error-return-code-101-functiontas – Js_zero Mar 05 '20 at 03:37
  • @mazaneicha I think both `file.txt` and `./file.txt` resolve to the current dir, so it may not work out. – Js_zero Mar 05 '20 at 03:44
  • @js-zero My suggestion was based on the example from DistributedCache javadoc: http://hadoop.apache.org/docs/r2.7.1/api/org/apache/hadoop/filecache/DistributedCache.html – mazaneicha Mar 05 '20 at 13:13
  • @mazaneicha I've seen the doc you shared. It seems it can be used in MR, but it still confuses me how to use it in a UDF. Could you show some example code? Thanks a lot. – Js_zero Mar 20 '20 at 02:15
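
For illustration, here is a minimal sketch of OneCricketeer's suggestion: reading the config directly from HDFS inside the UDF instead of relying on ADD FILE. The HdfsConfigReader class, the hdfs:// URI, and the namenode address are all hypothetical placeholders, and this assumes the user running the query has read access to the path (the permission issue mentioned in the comments above):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsConfigReader {

    // Hypothetical full HDFS URI of the config file; not from the original post.
    public static final String CONFIG_URI = "hdfs://namenode:8020/path/to/file.txt";

    // Reads the config line by line straight from HDFS, so the lookup no
    // longer depends on the local working directory at all.
    public static List<String> readConfig() throws IOException {
        Configuration conf = new Configuration();
        // FileSystem.get returns a cached instance shared across the JVM,
        // so it is deliberately not closed here.
        FileSystem fs = FileSystem.get(URI.create(CONFIG_URI), conf);
        List<String> lines = new ArrayList<>();
        try (FSDataInputStream in = fs.open(new Path(CONFIG_URI));
             BufferedReader br = new BufferedReader(
                     new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                lines.add(line.trim());
            }
        }
        return lines;
    }
}

Under this approach, ReadFile() could populate targetTraces/targets from the returned lines instead of opening a local FileInputStream.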
