Indexing only "readable/parsable" text from pdf

Question

I have to index a list of PDFs (PDF/A). For some of them there is no problem, but for others, when I look at the indexed content, I only see a lot of diamonds with a question mark in them.

I think the problem is either the font used in the document or that the content is "encapsulated" in a picture.

Is there a way to tell Tika to extract only the "readable/parsable" text of a PDF?

When I query all the documents (with my Java application), this is an example of what I see in the log file for the content of the problematic files:

DEBUG org.apache.http.wire -  << " [\n]"> 
DEBUG org.apache.http.wire -  << "  [0xe8]?[0x1]d41d8cd98f00b204e9800998ecf8427e[0xb][0xa4][0xe5][0x81](Diverses[0xe6]=aabhpdtyan3vfsujquccemebqr4m3[0xe7][0x81]?[0xc1][0x4] [\n]"> 
DEBUG org.apache.http.wire -  << " [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << " E-Mail zur Archivierung [\n]"> 
DEBUG org.apache.http.wire -  << " [\n]"> 
DEBUG org.apache.http.wire -  << "    [\n]"> 
DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][\n]"> 
DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][\n]"> 
DEBUG org.apache.http.wire -  << " [\n]"> 
DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]"> 
DEBUG org.apache.http.wire -  << " [\n]"> 
DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]"> 
DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]"> 
DEBUG org.apache.http.wire -  << " [\n]"> 
DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]"> 
DEBUG org.apache.http.wire -  << " [\n]"> 
DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]"> 
DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][\n]"> 
DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]"> 
DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]"> 
DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][\n]"> 
DEBUG org.apache.http.wire -  << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]"> 
DEBUG org.apache.http.wire -  << " [\n]"> 
DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]"> 
DEBUG org.apache.http.wire -  << " [\n]"> 
DEBUG org.apache.http.wire -  << " [0xef][0xbf][0xbd] [\n]"> 
DEBUG org.apache.http.wire -  << "  [\n]"> 
DEBUG org.apache.http.wire -  << " [\n]"> 
DEBUG org.apache.http.wire -  << " [0x9] data1.pdf [\n]">

Another problem is that for all the files (including the "good ones") there is a long run of \n characters at the beginning of the content field, as you can also see above. How can I avoid this?

Here is my schema.xml:

<?xml version="1.0" encoding="UTF-8" ?>
<schema name="simple" version="1.1">
    <types>
        <fieldtype name="string" class="solr.StrField" postingsFormat="SimpleText" />
        <fieldtype name="ignored" class="solr.TextField" />
        <fieldtype name="text" class="solr.TextField" postingsFormat="SimpleText">
            <analyzer>
                <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\n" replacement=""/>
                <tokenizer class="solr.StandardTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory" /> <!--Lowercases the letters in each token. Leaves non-letter tokens alone.-->
                <filter class="solr.ClassicFilterFactory" /> <!--Removes dots from acronyms and 's from the end of tokens. Works only on typed tokens produced by ClassicTokenizer or equivalent.-->
                <filter class="solr.TrimFilterFactory"/> <!--Trims whitespace at either end of a token. -->
                <filter class="solr.StopFilterFactory" ignoreCase="true"/> <!--Discards common words.  -->
                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
            </analyzer>
        </fieldtype>
    </types>

    <fields>
        <field name="signatureField" type="string" indexed="true" stored="true" multiValued="false" />
        <dynamicField name="ignored_*" type="ignored" multiValued="true" indexed="false" stored="false" />
        <field name="id" type="string" indexed="true" stored="true" multiValued="false" />
        <field name="rmDocumentTitle" type="string" indexed="true" stored="true" multiValued="true"/>
        <field name="fullText" indexed="true" type="text" multiValued="true" />
    </fields>

    <defaultSearchField>fullText</defaultSearchField>

    <solrQueryParser defaultOperator="OR" />
    <uniqueKey>id</uniqueKey>
</schema>

and my solrconfig.xml:

<?xml version="1.0" encoding="UTF-8" ?>
<config>
    <luceneMatchVersion>LUCENE_45</luceneMatchVersion>
    <directoryFactory name='DirectoryFactory' class='solr.MMapDirectoryFactory' />

    <codecFactory name="CodecFactory" class="solr.SchemaCodecFactory" />

    <lib dir='${solr.core.instanceDir}\lib' />
    <lib dir="${solr.core.instanceDir}\dist\" regex="solr-cell-\d.*\.jar" />
    <lib dir="${solr.core.instanceDir}\contrib\extraction\lib" regex=".*\.jar" />

    <requestHandler name="standard" class="solr.StandardRequestHandler" default="true" />

    <requestHandler name="/update" class="solr.UpdateRequestHandler">
        <lst name="defaults">
            <str name="update.chain">deduplication</str>
        </lst>
    </requestHandler>

    <requestHandler name="/update/extract" class="solr.extraction.ExtractingRequestHandler">
        <lst name="defaults">
            <str name="captureAttr">true</str>
            <str name="lowernames">false</str>
            <str name="overwrite">false</str>
            <str name="captureAttr">true</str>
            <str name="literalsOverride">true</str>
            <str name="uprefix">ignored_</str>
            <str name="fmap.a">link</str>
            <str name="fmap.content">fullText</str>
            <!-- the configuration here could be useful for tests -->
            <str name="update.chain">deduplication</str>
        </lst>
    </requestHandler>

    <updateRequestProcessorChain name="deduplication">
        <processor
            class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
            <bool name="overwriteDupes">false</bool>
            <str name="signatureField">signatureField</str>
            <bool name="enabled">true</bool>
            <str name="fields">content</str>
            <str name="minTokenLen">10</str>
            <str name="quantRate">.2</str>
            <str name="signatureClass">solr.update.processor.TextProfileSignature</str>
        </processor>
        <processor class="solr.LogUpdateProcessorFactory" />
        <processor class="solr.RunUpdateProcessorFactory" />
    </updateRequestProcessorChain>

    <requestHandler name="/admin/"
        class="org.apache.solr.handler.admin.AdminHandlers" />

    <lockType>none</lockType>

    <admin>
        <defaultQuery>*:*</defaultQuery>
    </admin>

</config>

@cheffe Here is a very simple example of a file that gives problems: [example](http://www.fileswap.com/dl/A0aAlNH6eR/) — Francesco, Mar 24 '14 at 10:35

score 0 · Answer 1 · edited May 23 '17 at 11:57

0

Re: Diamond with a question mark - it is the Unicode replacement character (U+FFFD), shown in place of bytes that could not be decoded as valid UTF-8. See Why does a diamond with a questionmark in it � appear in my HTML?

Try using ASCIIFoldingFilterFactory and see if it works for you.

UPDATE:

Since that doesn't work, can you try excluding all non-ASCII chars (beginning with SPACE) using this in your analyzer chain?

<charFilter class="solr.PatternReplaceCharFilterFactory" 
            pattern="([^\x20-\x7F])" 
            replacement=""/>

(See Regex any ascii character)

edited May 23 '17 at 11:57

Community

1
1

answered Mar 14 '14 at 21:06

arun

10,685
6
59
81

I added the filter but it doesn't help... Could it be a fonts problem? – Francesco Mar 19 '14 at 13:41
The charFilter brings no improvement either :( – Francesco Mar 21 '14 at 11:38
Mail the solr user group. The core Solr committers are more responsive there :) – arun Mar 21 '14 at 13:59

Indexing only "readable/parsable" text from pdf

1 Answer