0

Tried the following:

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.spi.FileTypeDetector;
import org.apache.tika.Tika;
import org.apache.tika.mime.MimeTypes;

/**
 * Detects the MIME type of a file with Apache Tika, trying the file name
 * first and inspecting the file itself only when the name is inconclusive.
 *
 * <p>Because a conclusive name-based guess is returned immediately, a file
 * whose extension was changed by hand is reported according to its new
 * extension.
 *
 * @author kiriti.k
 */
public class TikaFileTypeDetector {

    /** Tika facade shared by both detection steps. */
    private final Tika tika = new Tika();

    /** Creates a detector backed by a default {@link Tika} instance. */
    public TikaFileTypeDetector() {
    }

    /**
     * Determines the MIME type of the given file.
     *
     * @param path location of the file to inspect
     * @return the detected MIME type, or {@code null} when neither the file
     *         name nor the file itself yields anything more specific than
     *         {@code application/octet-stream}
     * @throws IOException propagated from Tika when the file cannot be read
     */
    public String probeContentType(Path path) throws IOException {
        // Try to detect based on the file name only for efficiency
        String fileNameDetect = tika.detect(path.toString());
        if (!fileNameDetect.equals(MimeTypes.OCTET_STREAM)) {
            return fileNameDetect;
        }

        // Then check the file content if necessary
        String fileContentDetect = tika.detect(path.toFile());
        if (!fileContentDetect.equals(MimeTypes.OCTET_STREAM)) {
            return fileContentDetect;
        }

        // Mirrors the java.nio.file.spi.FileTypeDetector contract: return
        // null if the file type could not be conclusively determined
        return null;
    }

    /**
     * Command-line entry point: prints the detected type of one file.
     *
     * @param args exactly one element, the path of the file to inspect
     * @throws IOException if detection fails while reading the file
     */
    public static void main(String[] args) throws IOException {

        // expects file path as the program argument
        if (args.length != 1) {
            printUsage();
            return;
        }

        Path path = Paths.get(args[0]);

        // The detector owns its own Tika instance, so none is created here.
        TikaFileTypeDetector detector = new TikaFileTypeDetector();
        // Analyse the file - first based on file name for efficiency.
        // If the name is inconclusive, analyse the content.
        String contentType = detector.probeContentType(path);

        System.out.println("File is of type - " + contentType);
    }

    /** Prints a one-line usage message naming the expected file argument. */
    public static void printUsage() {
        System.out.println("Usage: java -classpath ... "
                + TikaFileTypeDetector.class.getName()
                + " <file>");
    }
}

The above program determines the type based on the file extension only. How do I make it also check the content (MIME type) and then determine the type? I am using tika-app-1.8.jar in NetBeans 8.0.2. What am I missing?

kittu
  • 6,662
  • 21
  • 91
  • 185

2 Answers

4

The code checks the file extension first and returns the MIME type based on that, if it finds a result. If you want it to check the content first, just switch the two statements:

/**
 * Determines the MIME type of a file, consulting the file itself before
 * falling back to its name.
 *
 * @param path location of the file to inspect
 * @return the detected MIME type, or {@code null} when neither step yields
 *         anything other than {@code application/octet-stream}
 * @throws IOException propagated from Tika when the file cannot be read
 */
public String probeContentType(Path path) throws IOException {
    // Step 1: let Tika look at the file itself.
    final String byContent = tika.detect(path.toFile());
    final boolean contentIsConclusive = !byContent.equals(MimeTypes.OCTET_STREAM);
    if (contentIsConclusive) {
        return byContent;
    }

    // Step 2: the content told us nothing, so fall back to the file name.
    final String byName = tika.detect(path.toString());

    // A null result signals that the type could not be conclusively
    // determined by either step.
    return byName.equals(MimeTypes.OCTET_STREAM) ? null : byName;
}

Be aware that this may have a huge performance impact.

Thomas Weller
  • 55,411
  • 20
  • 125
  • 222
  • Thank you so much. Instead of the program probing for file content, you started probing me lol – kittu Jun 17 '15 at 13:16
0

You can use Files.probeContentType(path)

dijkstra
  • 1,068
  • 2
  • 16
  • 39
  • 1
    Tried that... It's checking based on the file type. Let's say the user changes the extension manually and runs the program to check the type... then `Files.probeContentType(path)` shows the modified extension type – kittu Jun 17 '15 at 13:03