I have 13,000 webpages with their body texts indexed. The goal is to get the top 200 phrase frequencies for one-word, two-word, three-word, ... up to eight-word phrases.
In total there are over 150 million words from these webpages that need to be tokenized.
The problem is that the query runs for about 15 minutes, then runs out of heap space and fails to complete.
I'm testing this on a 4-core, 8 GB RAM, SSD Ubuntu server. 6 GB of RAM is assigned to the heap, and swap is disabled.
Now, I can work around this by splitting the data into eight separate indices: the query/settings/mapping combination works for a single phrase length. That is, I can run this on one-word phrases, two-word phrases, etc. on their own and get the results I expect (though each of those runs still takes about 5 minutes). What I'm wondering is whether this full aggregation can be tuned to run on my hardware with a single index and a single query. (A single-aggregation example is shown after the full query below.)
Settings and mappings:
{
  "settings": {
    "index": {
      "number_of_shards": 1,
      "number_of_replicas": 0,
      "analysis": {
        "analyzer": {
          "analyzer_shingle_2": {
            "tokenizer": "standard",
            "filter": ["standard", "lowercase", "filter_shingle_2"]
          },
          "analyzer_shingle_3": {
            "tokenizer": "standard",
            "filter": ["standard", "lowercase", "filter_shingle_3"]
          },
          "analyzer_shingle_4": {
            "tokenizer": "standard",
            "filter": ["standard", "lowercase", "filter_shingle_4"]
          },
          "analyzer_shingle_5": {
            "tokenizer": "standard",
            "filter": ["standard", "lowercase", "filter_shingle_5"]
          },
          "analyzer_shingle_6": {
            "tokenizer": "standard",
            "filter": ["standard", "lowercase", "filter_shingle_6"]
          },
          "analyzer_shingle_7": {
            "tokenizer": "standard",
            "filter": ["standard", "lowercase", "filter_shingle_7"]
          },
          "analyzer_shingle_8": {
            "tokenizer": "standard",
            "filter": ["standard", "lowercase", "filter_shingle_8"]
          }
        },
        "filter": {
          "filter_shingle_2": {
            "type": "shingle",
            "max_shingle_size": 2,
            "min_shingle_size": 2,
            "output_unigrams": "false"
          },
          "filter_shingle_3": {
            "type": "shingle",
            "max_shingle_size": 3,
            "min_shingle_size": 3,
            "output_unigrams": "false"
          },
          "filter_shingle_4": {
            "type": "shingle",
            "max_shingle_size": 4,
            "min_shingle_size": 4,
            "output_unigrams": "false"
          },
          "filter_shingle_5": {
            "type": "shingle",
            "max_shingle_size": 5,
            "min_shingle_size": 5,
            "output_unigrams": "false"
          },
          "filter_shingle_6": {
            "type": "shingle",
            "max_shingle_size": 6,
            "min_shingle_size": 6,
            "output_unigrams": "false"
          },
          "filter_shingle_7": {
            "type": "shingle",
            "max_shingle_size": 7,
            "min_shingle_size": 7,
            "output_unigrams": "false"
          },
          "filter_shingle_8": {
            "type": "shingle",
            "max_shingle_size": 8,
            "min_shingle_size": 8,
            "output_unigrams": "false"
          }
        }
      }
    }
  },
  "mappings": {
    "items": {
      "properties": {
        "body": {
          "type": "multi_field",
          "fields": {
            "two-word-phrases": {
              "analyzer": "analyzer_shingle_2",
              "type": "string"
            },
            "three-word-phrases": {
              "analyzer": "analyzer_shingle_3",
              "type": "string"
            },
            "four-word-phrases": {
              "analyzer": "analyzer_shingle_4",
              "type": "string"
            },
            "five-word-phrases": {
              "analyzer": "analyzer_shingle_5",
              "type": "string"
            },
            "six-word-phrases": {
              "analyzer": "analyzer_shingle_6",
              "type": "string"
            },
            "seven-word-phrases": {
              "analyzer": "analyzer_shingle_7",
              "type": "string"
            },
            "eight-word-phrases": {
              "analyzer": "analyzer_shingle_8",
              "type": "string"
            }
          }
        }
      }
    }
  }
}
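For context, each document is indexed with a single plain body field, and the phrase sub-fields above are all analyzed from that same text at index time. A document body is simply (the text here is just a placeholder):
{
  "body": "full body text of the webpage goes here"
}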
Query:
{
  "size": 0,
  "aggs": {
    "one-word-phrases": {
      "terms": {
        "field": "body",
        "size": 200
      }
    },
    "two-word-phrases": {
      "terms": {
        "field": "body.two-word-phrases",
        "size": 200
      }
    },
    "three-word-phrases": {
      "terms": {
        "field": "body.three-word-phrases",
        "size": 200
      }
    },
    "four-word-phrases": {
      "terms": {
        "field": "body.four-word-phrases",
        "size": 200
      }
    },
    "five-word-phrases": {
      "terms": {
        "field": "body.five-word-phrases",
        "size": 200
      }
    },
    "six-word-phrases": {
      "terms": {
        "field": "body.six-word-phrases",
        "size": 200
      }
    },
    "seven-word-phrases": {
      "terms": {
        "field": "body.seven-word-phrases",
        "size": 200
      }
    },
    "eight-word-phrases": {
      "terms": {
        "field": "body.eight-word-phrases",
        "size": 200
      }
    }
  }
}
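For comparison, this is roughly what one of the single-phrase-length requests mentioned above looks like when run on its own (here for two-word phrases); it is just the corresponding aggregation pulled out of the full query, and this form does complete for me:
{
  "size": 0,
  "aggs": {
    "two-word-phrases": {
      "terms": {
        "field": "body.two-word-phrases",
        "size": 200
      }
    }
  }
}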