0

The goal of this code is to list an array of arrays containing object words of the length of it's index, which also contain an array of arrays of subwords that contain subwords of the length of it's index as well. These subwords are words that you can spell with the characters of the word itself. I'm trying to optimize my code so it can process about 81,000 words at a faster rate. The current program runs and will complete the process but it will take several hours to complete. I'm struggling to find where in my code can bested changed to decrease the time needed to run.

This is an example words.txt file.

do
dog
eat
go
god
goo
good
tea

Below is the expected subwords.json output from the txt file above.

[
    [],
    [],
    [
        {
            "word": "do",
            "subwords": [
                [],
                [],
                []
            ]
        },
        {
            "word": "go",
            "subwords": [
                [],
                [],
                []
            ]
        }
    ],
    [
        {
            "word": "dog",
            "subwords": [
                [],
                [],
                [
                    "do",
                    "go"
                ],
                [
                    "god"
                ]
            ]
        },
        {
            "word": "eat",
            "subwords": [
                [],
                [],
                [],
                [
                    "tea"
                ]
            ]
        },
        {
            "word": "god",
            "subwords": [
                [],
                [],
                [
                    "do",
                    "go"
                ],
                [
                    "dog"
                ]
            ]
        },
        {
            "word": "goo",
            "subwords": [
                [],
                [],
                [
                    "go"
                ],
                []
            ]
        },
        {
            "word": "tea",
            "subwords": [
                [],
                [],
                [],
                [
                    "eat"
                ]
            ]
        }
    ]
]

Below is the code that is running, calling the words.txt and outputting the subwords.json

const fs = require('fs')

const totalTalkText = fs.readFileSync('words.txt').toString()

const sortedWords = totalTalkText
  .toLowerCase()
  .split(/[^a-zA-Z]+/g)
  .sort()
  .sort((a, b) => {
    return a.length - b.length
  })

const finalData = new Array()

for (i = 0; i < sortedWords[0].length; i++) {
  finalData[i] = []
}

sortedWords.forEach((word, index) => {
  const subwordsArray = new Array()
  let wordObject = {
    word: word
  }

  if (finalData.indexOf(word.length) === -1) {
    finalData[word.length] = new Array()
  }

  sortedWords.some(subword => {
    if (subword.length > word.length) return

    if (subwordsArray.indexOf(subword.length) === -1) {
      subwordsArray[subword.length] = new Array()

      for (i = 0; i < sortedWords[0].length; i++) {
        subwordsArray[i] = []
      }
    }

    if (subword !== '' && word !== subword && isSubword(word, subword)) {
      subwordsArray[subword.length].push(subword)
    }
  })

  wordObject.subwords = subwordsArray

  finalData[word.length].push(wordObject)
  //console.log(`${word}: ${index / 814.88}%`) //This is mostly to help me gauge how long the program would take
})

fs.writeFileSync('subwords.json', JSON.stringify([...finalData]))

function isSubword(word, subword) {
  let tmpArray = new Array(256)

  for (let i = 0; i < 256; i++) tmpArray[i] = 0

  for (let i in word) tmpArray[word.charCodeAt(i)] += 1

  for (let i in subword) {
    tmpArray[subword.charCodeAt(i)] -= 1
    if (tmpArray[subword.charCodeAt(i)] < 0) return false
  }
  return true
}

I have been trying to find a better way to check if a word is a subword in the isSubword function but have had no success. Again, the code should function given a long period of time, I am just trying to get it to run significantly faster. Any help would be greatly appreciated!

RockyS
  • 59
  • 5
  • [Don't use `for…in` enumerations on arrays](https://stackoverflow.com/q/500504/1048572) - and even less on strings! – Bergi Sep 28 '19 at 00:27
  • You should cache the `tmpArray` for each of the words so that you can compare them directly. And then, instead of comparing them in a nested loop, try building a nested decision tree, keyed by count and nested by letter (depth in alphabetical order). That way you can quickly look up possible candidates. – Bergi Sep 28 '19 at 00:32

1 Answers1

0

For just the isSubword function I think you could speed things up by using a while loop (which is breakable) instead of for loops (which will always run the predetermined number of times). Here's an example where we break the word and subword into an array of characters, and then inside a while loop we check if each character of the subword is in the the word, and then remove it from the word array if it is. As you can see in the console log, the while loop will stop iterating if it finds a character that isn't contained in the word.

function isSubword(word, subword) {
  const wordArray = word.split("");
  const subwordArray = subword.split("");
  let isSubword = true;
  let i = 0;
  while(i < subwordArray.length && isSubword){
    const matchIndex = wordArray.findIndex(l => l === subwordArray[i] );
    console.log(subwordArray[i]);
    if(matchIndex < 0){
      isSubword = false;
    } else {
      wordArray.splice(matchIndex, 1);
      i += 1;
    }
  }
  return isSubword;
}

console.log(isSubword('test', 'set'));
console.log(isSubword('test', 'if'));
console.log(isSubword('test', 'tt'));
console.log(isSubword('test', 'ttt'));
Maria
  • 295
  • 2
  • 11