Part of the managed-schema.xml
<!-- Document identifier: required, single-valued, indexed and stored (field type "plong"). -->
<field name="uid"
       type="plong"
       indexed="true"
       stored="true"
       required="true"
       multiValued="false"/>

<!-- Raw ingredient value (field type "string"). Term vectors with positions are
     requested, while norms, term frequencies and positions are omitted from the postings.
     NOTE(review): declared single-valued (multiValued="false"); confirm this matches
     the shape of the documents being indexed. -->
<field name="ingredients"
       type="string"
       indexed="true"
       stored="true"
       required="false"
       multiValued="false"
       useDocValuesAsStored="true"
       termVectors="true"
       termPositions="true"
       termOffsets="false"
       omitNorms="true"
       omitTermFreqAndPositions="true"
       omitPositions="true"/>

<!-- Ingredient combinations analysed by the "ingredients_idx" field type.
     Term vectors keep positions, offsets and payloads; norms are omitted.
     NOTE(review): also declared single-valued; confirm against the indexed documents. -->
<field name="ingredients_mutations"
       type="ingredients_idx"
       indexed="true"
       stored="true"
       multiValued="false"
       useDocValuesAsStored="true"
       termVectors="true"
       termPositions="true"
       termOffsets="true"
       termPayloads="true"
       omitNorms="true"/>
<!-- Text field type used for the generated ingredient combinations.
     Inside each analyzer the elements are listed in the order they run:
     char filter first, then the tokenizer, then the token filters top to bottom. -->
<fieldType name="ingredients_idx" class="solr.TextField" positionIncrementGap="100">

  <!-- Index-time analysis: normalise the text, then expand it into combined terms with payloads. -->
  <analyzer type="index">
    <!-- Remove extra characters, then split the value into tokens (project-specific classes). -->
    <charFilter class="com.ourclass.solr.filter.OurReplaceFilterFactory"/>
    <tokenizer class="com.ourclass.solr.tokenizer.OurSplitTokenizerFactory"/>

    <!-- Per-token normalisation: trim whitespace, fold to ASCII (original form not kept), lowercase. -->
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="false"/>
    <filter class="solr.LowerCaseFilterFactory"/>

    <!-- Generate the string variations. The payload carries the maximum token count and
         the number of tokens that were joined to build the term.
         Example: "apple banana orange" may produce term vector values such as
         orangebanana 2|3, orangebananaapple 3|3 and apple 1|3. -->
    <filter class="com.ourclass.filter.OurTokenFilterFactory"
            minTokenSize="1"
            maxTokenSize="4"
            addPayload="true"
            eliminateDuplicates="false"/>

    <!-- Split each term at "|" and store the part after it as an integer payload. -->
    <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="integer" delimiter="|"/>
  </analyzer>

  <!-- Query-time analysis: same normalisation as at index time, but without
       generating variations or payloads. -->
  <analyzer type="query">
    <charFilter class="com.ourclass.solr.filter.OurReplaceFilterFactory"/>
    <tokenizer class="com.ourclass.solr.tokenizer.OurSplitTokenizerFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="false"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>

</fieldType>
Part of solrconfig.xml
<!-- Update processor that clones the values of the "ingredients" field into
     "ingredients_mutations" when a document is indexed, so the copy is analysed
     by the destination field's own field type. -->
<processor class="solr.CloneFieldUpdateProcessorFactory">
  <!-- Field to read from; must match the field name declared in the schema. -->
  <str name="source">ingredients</str>
  <!-- Field that receives the cloned values. -->
  <str name="dest">ingredients_mutations</str>
</processor>
Example Document
{
"uid":"juice_1",
"ingredients":["Apple","Avocado","Beetroot","Carrot","Lime","Cherry","Cranberry","Coconut water","Coconut milk",...],
"ingredients_count":[5],
"ingredients_mutations": ["apple | 1","avocado | 1","beetroot | 1","carrot | 1","lime | 1","cherry | 1","cranberry | 1","coconutwater | 1","coconutmilk | 1","appleavocado | 2","applebeetroot | 2","applecarrot | 2","applelime | 2","applecherry | 2","applecranberry | 2","applecoconutwater | 2","applecoconutmilk | 2","appleavocadobeetroot | 3","appleavocadocarrot | 3","appleavocadolime | 3","appleavocadocherry | 3","appleavocadocranberry | 3","appleavocadococonutwater | 3","appleavocadococonutmilk | 3","appleavocadobeetrootcarrot | 4","appleavocadobeetrootlime | 4","appleavocadobeetrootcherry | 4","appleavocadobeetrootcranberry | 4","appleavocadobeetrootcoconutwater | 4","appleavocadobeetrootcoconutmilk | 4","appleavocadobeetrootcarrotlime | 5","appleavocadobeetrootcarrotcherry | 5","appleavocadobeetrootcarrotcranberry | 5","appleavocadobeetrootcarrotcoconutwater | 5","appleavocadobeetrootcarrotcoconutmilk | 5", ...]
}
Example query
q={!frange l=90}mydist('ingredients_mutations','bananaapple')
defType=lucene
q.op=AND
fq=(
{!cache=false cost=200}ingredients_mutations:'bananaapple' OR
{!cache=false cost=200}ingredients_mutations:'bananaapple~2')
start=0
rows=100
fl=*,score,mydist:mydist('ingredients_mutations','bananaapple')
sort=max(mydist('ingredients_mutations','bananaapple')) desc, id asc
Example result
"uid":"juice_1",
…
I need the matched terms and the payload to calculate the score in my function.
How can I solve this?
Currently, I iterate through the mutations and calculate the distance using Levenshtein. This takes too long.