1

Hello fellow developers,

I am trying to build a chatbot using Markov chains and I am stuck on a problem. In the code below, I have made a random sentence generator that learns from movie scripts. The problem is: how do I get this sentence generator to stop being random and respond to the user's input instead? How should I go about doing this? Does it have something to do with input/output training, like this:

In: how are you today
Out: I'm good thanks how are you

Here is my code. Most of the functions are used to put the data in a CSV file, so don't mind those.

from collections import defaultdict
import random, itertools, nltk, pandas, csv, string, re, os, time

class Chatbot:
    """Markov-chain sentence generator trained on a dialogue transcript.

    The transcript is a text file whose dialogue lines look like
    ``SPEAKER: what they say``.  ``learn()`` converts it to a two-column CSV
    (``person``, ``text``), splits the text into sentences, and builds a
    second-order Markov model mapping each pair of consecutive words to the
    words observed right after that pair.

    Args:
        name: Display name, used in the greeting printed on construction.
        txt_transcript_filedir: Path to the transcript text file.
        character: If given, only lines spoken by this exact ``person``
            value are used for training; ``None`` trains on every speaker.
    """

    # Sentinel recorded as the successor of the last word pair of a sentence.
    END_TOKEN = '<end>'

    def __init__(self, name, txt_transcript_filedir, character=None):
        self.name = name
        self.txt_transcript_filedir = txt_transcript_filedir
        self.character = character
        print("Hello my name is " + name + ".")

    def parse_transcript(self):
        """Convert the text transcript into a ``person,text`` CSV file.

        The CSV is written next to the transcript with the extension swapped
        for ``.csv``; its path is stored in ``self.csv_transcript_filedir``.
        Bracketed annotations (``[...]``) are removed.  Each line is split on
        its first ``': '`` only, so colons inside the dialogue stay in the
        text.  Lines that have no speaker or no remaining text are skipped,
        because they cannot be attributed to anyone.
        """
        # splitext swaps only the real extension; a plain str.replace would
        # also rewrite '.txt' inside directory names and, when the name has no
        # '.txt' at all, leave the output path equal to the input path.
        root, _ = os.path.splitext(self.txt_transcript_filedir)
        self.csv_transcript_filedir = root + '.csv'

        rows = []
        with open(self.txt_transcript_filedir, encoding='utf-8') as txt_file:
            for line in txt_file:
                line = re.sub(r'\[.*?\]', '', line)
                person, separator, text = line.partition(': ')
                person = person.strip()
                text = text.strip()
                if separator and person and text:
                    rows.append((person, text))

        # newline='' is what the csv module requires of its output file; the
        # writer quotes fields containing commas, so the text needs no
        # pre-processing to stay in a single column.
        with open(self.csv_transcript_filedir, 'w', encoding='utf-8',
                  newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['person', 'text'])
            writer.writerows(rows)

    def tokenize_transcript(self):
        """Load the CSV and store punctuation-free sentences as training data.

        Reads ``self.csv_transcript_filedir``, optionally keeps only the rows
        whose ``person`` equals ``self.character``, splits every text into
        sentences with ``nltk.sent_tokenize`` and strips all characters found
        in ``string.punctuation``.  The non-empty results are stored in
        ``self.training_data`` as a list of strings.
        """
        # dtype=str / keep_default_na=False keep every cell a plain string, so
        # empty or numeric-looking cells never reach the tokenizer as NaN/int.
        transcript = pandas.read_csv(self.csv_transcript_filedir, dtype=str,
                                     keep_default_na=False)
        if self.character is not None:
            transcript = transcript[transcript['person'] == self.character]

        strip_punctuation = str.maketrans('', '', string.punctuation)
        sentences = []
        for text in transcript['text']:
            for sent in nltk.sent_tokenize(text):
                cleaned = sent.translate(strip_punctuation).strip()
                if cleaned:
                    sentences.append(cleaned)

        self.training_data = sentences

    def learn(self):
        """Run the full pipeline: parse, tokenize, then build the model."""
        self.parse_transcript()
        self.tokenize_transcript()
        self.make_word_dict(self.training_data)

    def make_word_dict(self, text):
        """Build the word-pair transition table from an iterable of sentences.

        Args:
            text: Iterable of sentence strings.

        For every two consecutive words in a sentence, the word that follows
        them is appended to ``self.vocabulary[(first, second)]``; the last
        pair of each sentence gets ``END_TOKEN``.  Repeated successors are
        kept, so a uniform random pick reproduces the observed frequencies.
        Sentences with fewer than two words contribute nothing.
        """
        word_dict = defaultdict(list)

        for sent in text:
            words = nltk.word_tokenize(sent)
            # Shift by two and pad with the end marker so that zip yields one
            # (first, second, following) triple per consecutive word pair.
            followers = words[2:] + [self.END_TOKEN]
            for first, second, following in zip(words, words[1:], followers):
                word_dict[(first, second)].append(following)

        self.vocabulary = word_dict

    def generate_text(self, num):
        """Generate ``num`` random sentences from the learned model.

        Args:
            num: Number of sentences to generate.

        Returns:
            The sentences joined by single spaces, each ending with a period.
            ``num == 1`` yields exactly one sentence; ``num <= 0`` yields an
            empty string.

        Raises:
            IndexError: If the vocabulary is empty (raised by
                ``random.choice``).
        """
        return ' '.join(self._generate_sentence() for _ in range(num))

    def _generate_sentence(self):
        """Random-walk the transition table until the end marker is drawn."""
        first, second = random.choice(list(self.vocabulary))
        words = [first, second]

        while True:
            # .get() instead of indexing: indexing a defaultdict with an
            # unseen pair would insert an empty list into the model and then
            # crash in random.choice; treat a dead end as end of sentence.
            successors = self.vocabulary.get((words[-2], words[-1]))
            if not successors:
                break
            following = random.choice(successors)
            if following == self.END_TOKEN:
                break
            words.append(following)

        return ' '.join(words) + '.'

    def say(self, text):
        """Speak ``text`` aloud through the ``say`` command (voice Oliver).

        The text is passed as a single argument without going through a
        shell, so quotes and shell metacharacters in it are spoken rather
        than interpreted.
        """
        import subprocess  # local import: the module is needed only here

        subprocess.run(['say', '-v', 'Oliver', text])


def main():
    """Train a JARVIS bot on the Age of Ultron script and print 100 sentences."""
    sentence_count = 100

    jarvis = Chatbot("J.A.R.V.I.S", "avengers_age_of_ultron.txt", "JARVIS")
    jarvis.learn()

    # One sentence per call, printed on its own line.
    for _ in range(sentence_count):
        print(jarvis.generate_text(1))


if __name__ == '__main__':
    main()
N. Chalifour
  • 103
  • 10
  • A Markov Chain consists of a set of states and the transition probability between these states; hence there is no concept of 'memory', which is what you need if you would like your responses to not be random. – Radix Aug 18 '16 at 19:06
  • Also have a look at this: http://stackoverflow.com/questions/5306729/how-do-markov-chain-chatbots-work – Radix Aug 18 '16 at 19:07
  • Then how do I go about adding memory? Using a hidden Markov model? @Radix – N. Chalifour Aug 18 '16 at 20:16
  • really depends on what your goal is. If you are just aiming to create a chatbot and you don't care about the underlying algorithm, then I suggest LSTM ANN as an alternative. – Radix Aug 19 '16 at 15:45
  • and how would I go about using a LSTM neural network in this case? @Radix – N. Chalifour Aug 19 '16 at 17:45
  • have a look at https://keras.io/ , and some of the examples on the github page. There's also this: http://www.wildml.com/2016/04/deep-learning-for-chatbots-part-1-introduction/ – Radix Aug 19 '16 at 18:43
  • Ok thanks will do @Radix – N. Chalifour Aug 19 '16 at 19:03

0 Answers0