feature: add bot source files (dialogue_manager.py, main_bot.py, utils.py)

IdeoG · IdeoG · commit b627cf6d8573 · 2019-11-12T18:19:33.000+03:00
diff --git a/dialogue_manager.py b/dialogue_manager.py
@@ -0,0 +1,76 @@
+import os
+
+from chatterbot.trainers import ChatterBotCorpusTrainer
+from sklearn.metrics.pairwise import pairwise_distances_argmin
+
+from chatterbot import ChatBot
+from utils import *
+
+
+class ThreadRanker(object):
+    """Picks the stored thread whose embedding is closest to a question.
+
+    Word embeddings are loaded once in the constructor; the thread
+    embeddings of a tag are unpickled again on every lookup.
+    """
+
+    def __init__(self, paths):
+        """Load the word embeddings and remember the thread-embeddings folder.
+
+        Args:
+          paths - mapping providing the 'WORD_EMBEDDINGS' and
+                  'THREAD_EMBEDDINGS_FOLDER' entries.
+        """
+        vectors_by_word, vector_size = load_embeddings(paths['WORD_EMBEDDINGS'])
+        self.word_embeddings = vectors_by_word
+        self.embeddings_dim = vector_size
+        self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER']
+
+    def __load_embeddings_by_tag(self, tag_name):
+        """Unpickle "<tag_name>.pkl" from the thread-embeddings folder.
+
+        Returns:
+          The (thread_ids, thread_embeddings) pair stored in that file.
+        """
+        pickle_path = os.path.join(self.thread_embeddings_folder, tag_name + ".pkl")
+        ids, vectors = unpickle_file(pickle_path)
+        return ids, vectors
+
+    def get_best_thread(self, question, tag_name):
+        """Return the id of the thread most similar to the question.
+
+        Only threads stored for *tag_name* are considered; similarity is
+        the cosine distance between the question vector and each thread
+        embedding.
+        """
+        ids, vectors = self.__load_embeddings_by_tag(tag_name)
+
+        query = question_to_vec(question, self.word_embeddings, self.embeddings_dim)
+        # A single query row is passed, so the result holds exactly one index.
+        nearest = pairwise_distances_argmin(X=[query], Y=vectors, metric='cosine')
+
+        return ids[nearest[0]]
+
+
+class DialogueManager(object):
+    """Answers a question with either the chit-chat bot or a thread lookup.
+
+    An intent recognizer decides which of the two parts handles each question.
+    """
+
+    def __init__(self, paths=RESOURCE_PATH):
+        """Unpickle the models named in *paths* and set up the chit-chat bot.
+
+        Args:
+          paths - mapping with the 'INTENT_RECOGNIZER', 'TFIDF_VECTORIZER' and
+                  'TAG_CLASSIFIER' entries plus the entries ThreadRanker reads.
+        """
+        print("Loading resources...")
+
+        # Models used to decide between chit-chat and the goal-oriented part.
+        self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER'])
+        self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER'])
+
+        # Reply format of the goal-oriented part: (predicted tag, thread id).
+        self.ANSWER_TEMPLATE = (
+            'I think its about %s\n'
+            'This thread might help you: https://stackoverflow.com/questions/%s'
+        )
+
+        # Models used by the goal-oriented part.
+        self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER'])
+        self.thread_ranker = ThreadRanker(paths)
+
+        self.create_chitchat_bot()
+
+    def create_chitchat_bot(self):
+        """Create self.chitchat_bot and train it on the English ChatterBot corpus."""
+        # The attribute is bound before training starts, so it exists even
+        # if one of the training calls raises.
+        self.chitchat_bot = bot = ChatBot('Aleksei')
+
+        bot.set_trainer(ChatterBotCorpusTrainer)
+        bot.train("chatterbot.corpus.english")
+
+    def generate_answer(self, question):
+        """Answer *question* with the part selected by intent recognition.
+
+        Returns:
+          For the 'dialogue' intent, whatever chitchat_bot.get_response
+          returns for the raw question; otherwise ANSWER_TEMPLATE filled
+          with the predicted tag and the id of the best matching thread.
+        """
+        cleaned = text_prepare(question)
+        features = self.tfidf_vectorizer.transform([cleaned])
+        intent = self.intent_recognizer.predict(features)[0]
+
+        # Chit-chat part: the bot receives the original, unprepared text.
+        if intent == 'dialogue':
+            return self.chitchat_bot.get_response(question)
+
+        # Goal-oriented part: the same features feed the tag classifier.
+        tag = self.tag_classifier.predict(features)[0]
+        thread_id = self.thread_ranker.get_best_thread(cleaned, tag)
+        return self.ANSWER_TEMPLATE % (tag, thread_id)
diff --git a/main_bot.py b/main_bot.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+import requests
+import time
+import argparse
+import os
+import json
+
+from requests.compat import urljoin
+from dialogue_manager import DialogueManager
+
+
+class BotHandler(object):
+    """
+        BotHandler is a class which implements all back-end of the bot.
+        It has three main functions:
+            'get_updates' - checks for new messages
+            'send_message' - posts new message to user
+            'get_answer' - computes the most relevant answer to a user's question
+    """
+
+    def __init__(self, token, dialogue_manager):
+        """Store the token, build the API base URL and keep the manager.
+
+        Args:
+          token - Telegram bot token, embedded into the API URL.
+          dialogue_manager - object whose generate_answer(question) method
+                             produces the replies.
+        """
+        self.token = token
+        self.api_url = "https://api.telegram.org/bot{}/".format(token)
+        self.dialogue_manager = dialogue_manager
+
+    def get_updates(self, offset=None, timeout=30):
+        """Request pending updates from the getUpdates endpoint.
+
+        Args:
+          offset - value sent as the 'offset' request parameter.
+          timeout - value sent as the 'timeout' request parameter.
+
+        Returns:
+          The "result" entry of the decoded JSON response, or an empty list
+          when the body is not valid JSON or contains no "result" entry.
+        """
+        params = {"timeout": timeout, "offset": offset}
+        raw_resp = requests.get(urljoin(self.api_url, "getUpdates"), params=params)
+        try:
+            resp = raw_resp.json()
+        except ValueError as e:
+            # Every JSON decode error requests can raise (stdlib json,
+            # simplejson, requests' own JSONDecodeError) derives from
+            # ValueError; catching only json.decoder.JSONDecodeError lets
+            # the simplejson variant escape.
+            print("Failed to parse response {}: {}.".format(raw_resp.content, e))
+            return []
+
+        if "result" not in resp:
+            return []
+        return resp["result"]
+
+    def send_message(self, chat_id, text):
+        """POST *text* for *chat_id* to the sendMessage endpoint.
+
+        Returns:
+          The response object returned by requests.post.
+        """
+        params = {"chat_id": chat_id, "text": text}
+        return requests.post(urljoin(self.api_url, "sendMessage"), data=params)
+
+    def get_answer(self, question):
+        """Return the reply for *question*.
+
+        The literal '/start' command gets a fixed greeting; anything else
+        is delegated to the dialogue manager.
+        """
+        if question == '/start':
+            return "Hi, I am your project bot. How can I help you today?"
+        return self.dialogue_manager.generate_answer(question)
+
+
+def parse_args():
+    """Parse the command line and return the resulting namespace.
+
+    The only option is --token, a string that defaults to ''.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--token', default='', type=str)
+    namespace = cli.parse_args()
+    return namespace
+
+
+def is_unicode(text):
+    """Tell whether *text* has as many characters as encoded bytes.
+
+    Despite the name, this is True only when every character takes exactly
+    one byte in str.encode()'s default UTF-8 encoding, i.e. when the text
+    is pure ASCII.
+    """
+    byte_count = len(text.encode())
+    char_count = len(text)
+    return char_count == byte_count
+
+
+class SimpleDialogueManager(object):
+    """
+    Minimal dialogue manager for testing the telegram bot: it gives the
+    same fixed reply to every question.
+    A more advanced manager belongs in dialogue_manager.py.
+    """
+
+    def generate_answer(self, question):
+        """Return the fixed greeting; *question* is ignored."""
+        greeting = "Hello, world!"
+        return greeting
+
+
+def main():
+    """Resolve the bot token, load the dialogue manager and poll forever.
+
+    The token comes from --token or, when that is empty, from the
+    TELEGRAM_TOKEN environment variable; if neither is available a hint is
+    printed and the function returns. Each received text message is
+    answered through the bot, and the polling offset moves past every
+    update that was seen.
+    """
+    token = parse_args().token
+
+    if not token:
+        # No --token given: fall back to the environment.
+        try:
+            token = os.environ["TELEGRAM_TOKEN"]
+        except KeyError:
+            print("Please, set bot token through --token or TELEGRAM_TOKEN env variable")
+            return
+
+    manager = DialogueManager()
+    bot = BotHandler(token, manager)
+
+    print("Ready to talk!")
+    next_offset = 0
+    while True:
+        for update in bot.get_updates(offset=next_offset):
+            print("An update received.")
+            if "message" in update:
+                message = update["message"]
+                chat_id = message["chat"]["id"]
+                if "text" in message:
+                    text = message["text"]
+                    if is_unicode(text):
+                        print("Update content: {}".format(update))
+                        reply = bot.get_answer(text)
+                    else:
+                        reply = "Hmm, you are sending some weird characters to me..."
+                    bot.send_message(chat_id, reply)
+
+            # Advance past this update even when it carried no usable text.
+            candidate = update['update_id'] + 1
+            if candidate > next_offset:
+                next_offset = candidate
+        time.sleep(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/utils.py b/utils.py
@@ -0,0 +1,81 @@
+import nltk
+import pickle
+import re
+import numpy as np
+
+nltk.download('stopwords')
+from nltk.corpus import stopwords
+
+# Paths for all resources for the bot.
+# Every path is relative, so it is resolved against the current working
+# directory of the process.
+RESOURCE_PATH = {
+    # Pickled model that DialogueManager loads as its intent recognizer.
+    'INTENT_RECOGNIZER': 'models/intent_recognizer.pkl',
+    # Pickled model that DialogueManager loads as its tag classifier.
+    'TAG_CLASSIFIER': 'models/tag_classifier.pkl',
+    # Pickled vectorizer that turns a prepared question into features.
+    'TFIDF_VECTORIZER': 'models/tfidf_vectorizer.pkl',
+    # Folder from which ThreadRanker unpickles one "<tag>.pkl" file per tag.
+    'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags',
+    # Tab-separated word embeddings file read by load_embeddings.
+    'WORD_EMBEDDINGS': 'models/word_embeddings.tsv',
+}
+
+
+def text_prepare(text):
+    """Performs tokenization and simple preprocessing.
+
+    Args:
+      text - raw input string.
+
+    Returns:
+      The lower-cased text in which the characters /(){}[]|@,; were
+      replaced by spaces, every character other than 0-9, a-z, space,
+      '#', '+' and '_' was removed, English stopwords were dropped and the
+      remaining tokens were joined by single spaces.
+    """
+
+    # Raw string literals: in a normal literal the sequences \[ \] and \|
+    # are invalid escapes and trigger a warning on current Python versions.
+    # The compiled patterns are unchanged.
+    replace_by_space_re = re.compile(r'[/(){}\[\]\|@,;]')
+    bad_symbols_re = re.compile(r'[^0-9a-z #+_]')
+    stopwords_set = set(stopwords.words('english'))
+
+    text = text.lower()
+    text = replace_by_space_re.sub(' ', text)
+    text = bad_symbols_re.sub('', text)
+
+    # str.split() without arguments never yields empty tokens, and joining
+    # them with single spaces leaves no outer whitespace to strip.
+    return ' '.join(token for token in text.split() if token not in stopwords_set)
+
+
+def load_embeddings(embeddings_path, embeddings_dim=100):
+    """Loads pre-trained word embeddings from tsv file.
+
+    Each line is expected to hold a word followed by tab-separated numbers.
+
+    Args:
+      embeddings_path - path to the embeddings file.
+      embeddings_dim - expected number of values per word (defaults to 100);
+                       lines with any other number of values are skipped.
+
+    Returns:
+      embeddings - dict mapping words to float32 numpy vectors;
+      embeddings_dim - dimension of the vectors, as passed in.
+
+    Raises:
+      ValueError - if a value on some line cannot be converted to a float.
+    """
+    embeddings = dict()
+
+    with open(embeddings_path, 'r') as ss_tsv:
+        for line in ss_tsv:
+            key = line.split('\t')[0]
+            value = np.array(line.strip().split('\t')[1:], dtype=np.float32)
+
+            # Blank or malformed lines end up with the wrong length and
+            # are ignored instead of being stored.
+            if len(value) != embeddings_dim:
+                continue
+
+            embeddings[key] = value
+
+    return embeddings, embeddings_dim
+
+
+def question_to_vec(question, embeddings, dim=300):
+    """
+        question: a string, split on single spaces into words
+        embeddings: dict where the key is a word and the value is its embedding
+        dim: size of the representation
+
+        result: float32 mean of the embeddings of the known words, or a
+                zero vector of length dim when no word is known
+    """
+    total = np.zeros(dim, dtype=np.float32)
+    known_words = 0
+    for token in question.split(' '):
+        if token in embeddings:
+            known_words += 1
+            total += embeddings[token]
+
+    if known_words == 0:
+        return total
+    return total / known_words
+
+
+def unpickle_file(filename):
+    """Open *filename* in binary mode and return the unpickled object.
+
+    NOTE: pickle.load can execute arbitrary code while loading, so this
+    must only be used on files from a trusted source.
+    """
+    with open(filename, 'rb') as stream:
+        payload = pickle.load(stream)
+    return payload