0

I am trying to push (Post) pdf files to Solr/Tika for text extraction and indexing using Ajax/js. I've gotten the following curl command to work:

curl 'http://localhost:8983/solr/techproducts/update/extract?literal.id=doc1&commit=true' -F "myfile=@/PathToFile/SomeDoc.pdf"

This command puts the desired pdf into the Solr index, and I can retrieve it just fine. However, I need to be able to do this from a web browser. After much googling and a little experimentation, I've got the following js code ALMOST working. It returns a 0 status code and a status of Success, but nothing gets committed to the index:

   $("#solrPost").click(function(event) {
        event.stopPropagation();
        event.preventDefault();

        /* Read a local pdf file as a blob */
        let fileAsBlob = null;
        let file = $('#upload_file')[0].files[0];
        let myReader = new FileReader();

        myReader.onloadend = function() {
            fileAsBlob = myReader.result;
            sendToSolr(fileAsBlob); 
        };
        fileAsBlob = myReader.readAsArrayBuffer(file);

        function sendToSolr(fileAsBlob) {
            $.ajax({ 
                url:"http://localhost:8983/solr/techproducts/update/extract?literal.id=doc2&commit=true",
                type: 'POST',
                data: fileAsBlob,
                cache: false,
                crossOrigin: true,
                dataType: 'jsonp',
                jsonp: 'json.wrf',
                processData: false,
                contentType: false, 

                success: function(data, status) {
                    console.log("Ajax.post successful, status: " + data.responseHeader.status + "\t status text: " + status);
                    console.log("debug");
                },
                error: function(data, status) {
                    console.log("Ajax.post error, status: " + data.status + "\t status text:" + data.statusText);
                },
                done: function(data, status) {
                    console.log("Ajax.post Done");
                }
            });
        }

This is SO close to working, but I just can't figure out what's going wrong. All indications (from the client side) are good, but nothing is added to the index. Note:

  1. The fileReader is working, I see an Array of the same size as the source pdf.
  2. Even though I specify POST, when I examine the network tab in the browser/debugger, it says GET.
  3. I've hardcoded the literal.id=doc2 for simplicity, not a long term strategy...

I know there are similar posts, but none address the issue of extracting PDFs using Solr/Tika outside of the provided post script. Thanks in advance.

GeoffWillis
  • 113
  • 2
  • 13
  • Solr doesn't set any CORS options - are you sure you're even allowed to make the POST to Solr from Javascript? – MatsLindh Oct 11 '18 at 19:59
  • @MatsLindh- Wasn't sure, so used the Admin API to create/submit a doc: `http://localhost:8983/solr/techproducts/update?_=1539289351207&commitWithin=1000&overwrite=true&wt=json)`---It worked. I assumed the jsonp would handle the CORS issues since I needed to add it to retrieve the docs I'd posted using the curl command – GeoffWillis Oct 11 '18 at 20:46
  • But jsonp is a way to work around the requirement for CORS headers - to submit data that way you'd probably have to do a hidden iframe, create an actual form (with the file content as a hidden field, but I'm not sure if that's going to work) and submit it, instead of going through javascript. When the POST request happens, usually there's a pre-flight OPTIONS request to determine if the request is valid. – MatsLindh Oct 12 '18 at 06:54
  • I have seen some examples using new formData() and adding the data to it. Will try that after work. Thanks for the input, will get back to you. – GeoffWillis Oct 12 '18 at 12:15

1 Answer

0

Well, it took some searching, but thanks to a post by "tonejac" I found the solution. If you look at the question "JQuery Ajax is sending GET instead of POST", the VERY last comment states that if you use dataType: 'jsonp', "POST" gets converted to "GET". I deleted the jsonp, installed a plugin to handle the CORS issue I was trying to avoid by using jsonp, and voilà, it worked. For those interested, the working code is posted below. It's not fancy or robust, but it allows me to post or get documents (.pdf, .docx...) to Solr from a web app. I've only posted the js code, but the html is simple and provides an input of type "file", as well as inputs to set the id for posting docs or searching by id. There are two buttons, solrPost and solrGet, which call the listeners in the js. The connectSolr() function is called from the html onLoad.

function connectSolr() {
/*
 * Click handler for #solrPost.
 *
 * Reads the file chosen in #upload_file into an ArrayBuffer and POSTs the raw
 * bytes to the Solr/Tika extract handler, using the value of #userSetId as the
 * document's literal.id. Outcome is reported on the console.
 */
$("#solrPost").click(function(event) {
    event.stopPropagation();
    event.preventDefault();

    /* Nothing to upload if the user has not picked a file yet. */
    const file = $('#upload_file')[0].files[0];
    if (!file) {
        console.log("Ajax.post error, no file selected");
        return;
    }

    /* Get the unique Id for the doc and append it (URL-encoded, so ids with
       '&', '#', spaces etc. do not corrupt the query string) to the extract url */
    const docId = $("#userSetId").val();
    const extractUrl = "http://localhost:8983/solr/techproducts/update/extract/?commit=true&literal.id=" + encodeURIComponent(docId);

    /* Ajax call to Solr/Tika to extract text from the file and index it */
    function sendToSolr(fileAsBlob) {
        $.ajax({
            url: extractUrl,
            type: 'POST',
            data: fileAsBlob,
            cache: false,
            /* send the bytes as-is: no query-string conversion, no forced Content-Type */
            processData: false,
            contentType: false,

            success: function(data, status) {
                console.log("Ajax.post successful, status: " + data.responseHeader.status + "\t status text: " + status);
            },
            error: function(jqXHR, status) {
                console.log("Ajax.post error, status: " + jqXHR.status + "\t status text:" + jqXHR.statusText);
            },
            /* `complete` (not `done`) is the $.ajax setting that runs after success/error */
            complete: function() {
                console.log("Ajax.post Done");
            },
        });
    }

    /* Read the local file; only a successful read is forwarded to Solr. */
    const myReader = new FileReader();
    myReader.onload = function() {
        sendToSolr(myReader.result);
    };
    myReader.onerror = function() {
        console.log("Ajax.post error, could not read file: " + myReader.error);
    };
    myReader.readAsArrayBuffer(file);
});


/*
 * Click handler for #solrGet.
 *
 * Looks up the document whose id is typed into #docId by querying the Solr
 * select handler via JSONP (callback parameter name json.wrf) and hands the
 * response to renderDoc. Errors and completion are logged to the console.
 */
$("#solrGet").click(function(event) {
    event.stopPropagation();
    event.preventDefault();

    const query = "id:" + $("#docId").val();

    $.ajax({
        url: "http://localhost:8983/solr/techproducts/select/",
        type: "get",
        dataType: "jsonp",
        data: {
            q: query
        },
        jsonp: "json.wrf",
        success: function(data, status) {
            renderDoc(data, status);
        },
        error: function(jqXHR, status) {
            console.log("Ajax.get error, Error: " + status);
        },
        /* `complete` (not `done`) is the $.ajax setting that runs after success/error */
        complete: function() {
            console.log("Ajax.get Done");
        }
    });
});


/*
 * Renders the extracted text of the first document in a Solr select response
 * into #textArea, one <span> per line of text, each followed by a <br/>.
 *
 * theText:    parsed Solr response; the text is taken from
 *             response.docs[0].content (first element when it is an array).
 * statusCode: Ajax status string passed by the caller; not used here.
 *
 * The text area is always cleared first. If the response has no documents or
 * the first document has no content, a message is logged and nothing is
 * rendered. Lines are inserted with .text() so that characters such as '<' in
 * the extracted document are shown literally rather than parsed as HTML.
 */
const renderDoc = function(theText, statusCode) {
    const $textArea = $("#textArea");
    $textArea.empty();

    const docs = (theText && theText.response && theText.response.docs) || [];
    const content = docs.length > 0 ? docs[0].content : undefined;
    const extractedText = Array.isArray(content) ? content[0] : content;
    if (extractedText === undefined || extractedText === null) {
        console.log("renderDoc: no extracted content in Solr response");
        return;
    }

    String(extractedText).split('\n').forEach(function(line) {
        const $line = $("<span />").text(line).append("<br/>");
        $textArea.append($line);
    });
};

}

GeoffWillis
  • 113
  • 2
  • 13