Skip to content

Commit b627cf6

Browse files
committed
feature: added src
1 parent f307816 commit b627cf6

File tree

3 files changed

+260
-0
lines changed

3 files changed

+260
-0
lines changed

dialogue_manager.py

Lines changed: 76 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,76 @@
1+
import os
2+
3+
from chatterbot.trainers import ChatterBotCorpusTrainer
4+
from sklearn.metrics.pairwise import pairwise_distances_argmin
5+
6+
from chatterbot import ChatBot
7+
from utils import *
8+
9+
10+
class ThreadRanker(object):
    """Picks the stored thread whose embedding is closest to a question.

    Word embeddings are loaded once at construction time; the per-tag
    thread embeddings are unpickled on every lookup.
    """

    def __init__(self, paths):
        """Load word embeddings and remember the thread-embeddings folder.

        Args:
            paths: mapping that provides the 'WORD_EMBEDDINGS' and
                'THREAD_EMBEDDINGS_FOLDER' entries.
        """
        loaded = load_embeddings(paths['WORD_EMBEDDINGS'])
        self.word_embeddings, self.embeddings_dim = loaded
        self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER']

    def __load_embeddings_by_tag(self, tag_name):
        """Unpickle the (thread_ids, thread_embeddings) pair stored for a tag."""
        pickle_name = tag_name + ".pkl"
        pickle_path = os.path.join(self.thread_embeddings_folder, pickle_name)
        ids, vectors = unpickle_file(pickle_path)
        return ids, vectors

    def get_best_thread(self, question, tag_name):
        """Return the id of the thread most similar to the question.

        Only threads stored under ``tag_name`` are searched; similarity is
        the cosine distance between the question vector and each thread
        embedding.
        """
        candidate_ids, candidate_vectors = self.__load_embeddings_by_tag(tag_name)

        query = question_to_vec(question, self.word_embeddings, self.embeddings_dim)
        # A single query row is passed in, so the result has exactly one index.
        nearest = pairwise_distances_argmin(
            X=[query], Y=candidate_vectors, metric='cosine')

        return candidate_ids[nearest[0]]
30+
31+
32+
class DialogueManager(object):
    """Answers a question with either chit-chat or a StackOverflow link.

    An intent classifier decides which of the two parts handles each
    incoming question.
    """

    def __init__(self, paths=RESOURCE_PATH):
        """Load every model named in ``paths`` and build the chit-chat bot."""
        print("Loading resources...")

        # Intent recognition:
        self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER'])
        self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER'])

        # Filled with (tag, thread id) for goal-oriented answers.
        self.ANSWER_TEMPLATE = (
            'I think its about %s\n'
            'This thread might help you: https://stackoverflow.com/questions/%s'
        )

        # Goal-oriented part:
        self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER'])
        self.thread_ranker = ThreadRanker(paths)

        self.create_chitchat_bot()

    def create_chitchat_bot(self):
        """Create ``self.chitchat_bot`` and train it on the English corpus."""
        # NOTE(review): set_trainer()/train() on the bot object looks like the
        # pre-1.0 ChatterBot API - confirm against the installed version.
        self.chitchat_bot = bot = ChatBot('Aleksei')
        bot.set_trainer(ChatterBotCorpusTrainer)
        bot.train("chatterbot.corpus.english")

    def generate_answer(self, question):
        """Combine the StackOverflow and chit-chat parts via intent recognition.

        Returns whatever the chit-chat bot's ``get_response`` yields for
        'dialogue' intents, otherwise the filled-in ``ANSWER_TEMPLATE`` string.
        """
        cleaned = text_prepare(question)
        features = self.tfidf_vectorizer.transform([cleaned])
        intent = self.intent_recognizer.predict(features)[0]

        # Chit-chat part: the bot receives the raw, unprepared question.
        if intent == 'dialogue':
            return self.chitchat_bot.get_response(question)

        # Goal-oriented part: classify the tag, then rank threads under it.
        tag = self.tag_classifier.predict(features)[0]
        thread_id = self.thread_ranker.get_best_thread(cleaned, tag)
        return self.ANSWER_TEMPLATE % (tag, thread_id)

main_bot.py

Lines changed: 103 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,103 @@
1+
#!/usr/bin/env python3
2+
3+
import requests
4+
import time
5+
import argparse
6+
import os
7+
import json
8+
9+
from requests.compat import urljoin
10+
from dialogue_manager import DialogueManager
11+
12+
13+
class BotHandler(object):
    """
        BotHandler is a class which implements all back-end of the bot.
        It has three main functions:
            'get_updates' - checks for new messages
            'send_message' - posts new message to user
            'get_answer' - computes the most relevant answer to a user's question
    """

    def __init__(self, token, dialogue_manager):
        """Store the token, derive the API base URL and keep the manager.

        Args:
            token: Telegram bot token; it is embedded in every request URL.
            dialogue_manager: object exposing ``generate_answer(question)``.
        """
        self.token = token
        self.api_url = "https://api.telegram.org/bot{}/".format(token)
        self.dialogue_manager = dialogue_manager

    def get_updates(self, offset=None, timeout=30):
        """Fetch pending updates via long polling.

        Args:
            offset: value sent as the ``offset`` request parameter.
            timeout: value sent as the ``timeout`` request parameter
                (server-side long-poll duration).

        Returns:
            The ``result`` entry of the decoded response, or an empty list
            when the request fails, the body is not valid JSON, or the
            payload has no ``result`` key.
        """
        params = {"timeout": timeout, "offset": offset}
        # Give the HTTP client a little longer than the long-poll itself so a
        # stalled connection cannot block the polling loop forever.
        client_timeout = None if timeout is None else timeout + 10
        try:
            raw_resp = requests.get(
                urljoin(self.api_url, "getUpdates"),
                params,
                timeout=client_timeout,
            )
        except requests.exceptions.RequestException as e:
            # Only the exception type is printed: the message usually carries
            # the request URL, which contains the bot token.
            print("Failed to fetch updates: {}.".format(type(e).__name__))
            return []

        try:
            resp = raw_resp.json()
        except ValueError as e:
            # ValueError also covers json.JSONDecodeError (a subclass).
            print("Failed to parse response {}: {}.".format(raw_resp.content, e))
            return []

        if not isinstance(resp, dict):
            return []
        return resp.get("result", [])

    def send_message(self, chat_id, text):
        """POST ``text`` to ``chat_id`` and return the HTTP response object."""
        params = {"chat_id": chat_id, "text": text}
        return requests.post(urljoin(self.api_url, "sendMessage"), data=params)

    def get_answer(self, question):
        """Return the greeting for '/start', else delegate to the manager."""
        if question == '/start':
            return "Hi, I am your project bot. How can I help you today?"
        return self.dialogue_manager.generate_answer(question)
48+
49+
50+
def parse_args(argv=None):
    """Parse the bot's command-line options.

    Args:
        argv: optional list of argument strings. When omitted (None),
            argparse falls back to the process command line, which is the
            original behaviour.

    Returns:
        The parsed namespace; ``token`` is an empty string when ``--token``
        is not supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--token', type=str, default='')
    return parser.parse_args(argv)
54+
55+
56+
def is_unicode(text):
    """Tell whether ``text`` encodes to exactly one byte per character.

    The text is encoded with ``str.encode()`` defaults and the byte count
    is compared with the character count, so any character that needs more
    than one byte makes the result False.
    """
    byte_count = len(text.encode())
    return byte_count == len(text)
58+
59+
60+
class SimpleDialogueManager(object):
    """
    The simplest possible dialogue manager, used to test the telegram bot.
    A more advanced one is meant to live in dialogue_manager.py.
    """

    def generate_answer(self, question):
        """Return the same fixed greeting regardless of ``question``."""
        reply = "Hello, world!"
        return reply
68+
69+
70+
def main():
    """Run the bot: resolve the token, load models, then poll forever.

    The token comes from ``--token`` or, failing that, the TELEGRAM_TOKEN
    environment variable. If neither yields a non-empty value a hint is
    printed and the function returns without starting the bot.
    """
    args = parse_args()
    # The command-line token wins; an unset or empty environment variable
    # counts as "no token" rather than starting a bot that cannot log in.
    token = args.token or os.environ.get("TELEGRAM_TOKEN")
    if not token:
        print("Please, set bot token through --token or TELEGRAM_TOKEN env variable")
        return

    manager = DialogueManager()
    bot = BotHandler(token, manager)

    print("Ready to talk!")
    offset = 0
    while True:
        updates = bot.get_updates(offset=offset)
        for update in updates:
            print("An update received.")
            if "message" in update:
                message = update["message"]
                chat_id = message["chat"]["id"]
                if "text" in message:
                    text = message["text"]
                    if is_unicode(text):
                        print("Update content: {}".format(update))
                        bot.send_message(chat_id, bot.get_answer(text))
                    else:
                        bot.send_message(chat_id, "Hmm, you are sending some weird characters to me...")
            # Advance past this update so it is not requested again.
            offset = max(offset, update['update_id'] + 1)
        time.sleep(1)


if __name__ == "__main__":
    main()

utils.py

Lines changed: 81 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,81 @@
1+
import nltk
2+
import pickle
3+
import re
4+
import numpy as np
5+
6+
nltk.download('stopwords')
7+
from nltk.corpus import stopwords
8+
9+
# Locations of every resource the bot loads, keyed by resource name.
RESOURCE_PATH = dict(
    INTENT_RECOGNIZER='models/intent_recognizer.pkl',
    TAG_CLASSIFIER='models/tag_classifier.pkl',
    TFIDF_VECTORIZER='models/tfidf_vectorizer.pkl',
    THREAD_EMBEDDINGS_FOLDER='thread_embeddings_by_tags',
    WORD_EMBEDDINGS='models/word_embeddings.tsv',
)
17+
18+
19+
# Compiled once at import time instead of on every call. Raw strings keep the
# backslash escapes from being treated as (invalid) string escapes.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

# Filled on first use so the stop-word list is built only once per process.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the cached set of NLTK English stop words."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_prepare(text, stopwords_set=None):
    """Performs tokenization and simple preprocessing.

    Steps, in order: lowercase; replace ``/(){}[]|@,;`` with spaces; delete
    every character other than ``0-9``, ``a-z``, space, ``#``, ``+`` and
    ``_``; drop stop words; join the remaining tokens with single spaces.

    Args:
        text: the raw input string.
        stopwords_set: optional collection of words to drop. When None,
            the NLTK English stop-word list is used.

    Returns:
        The cleaned string (empty if nothing survives).
    """
    if stopwords_set is None:
        stopwords_set = _english_stopwords()

    text = text.lower()
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)
    # str.split() never yields empty tokens, so only stop words need filtering.
    return ' '.join(word for word in text.split() if word not in stopwords_set)
32+
33+
34+
def load_embeddings(embeddings_path, embeddings_dim=100):
    """Loads pre-trained word embeddings from tsv file.

    Each line is expected to hold a word followed by its tab-separated
    vector components. Lines whose vector length differs from
    ``embeddings_dim`` are skipped.

    Args:
      embeddings_path - path to the embeddings file.
      embeddings_dim - expected vector length (defaults to 100, the value
        that used to be hard-coded).

    Returns:
      embeddings - dict mapping words to float32 vectors;
      embeddings_dim - dimension of the vectors.
    """
    embeddings = dict()

    # Explicit encoding so the result does not depend on the platform locale.
    with open(embeddings_path, 'r', encoding='utf-8') as tsv_file:
        for line in tsv_file:
            # The key is taken from the unstripped line, as before, while the
            # values come from the stripped line to drop the trailing newline.
            key = line.split('\t', 1)[0]
            value = np.array(line.strip().split('\t')[1:], dtype=np.float32)

            if len(value) != embeddings_dim:
                continue

            embeddings[key] = value

    return embeddings, embeddings_dim
58+
59+
60+
def question_to_vec(question, embeddings, dim=300):
    """
        Average the embeddings of the known words in a question.

        question: a string, tokenized by splitting on single spaces
        embeddings: dict where the key is a word and the value is its embedding
        dim: size of the representation

        result: float32 vector of length dim; all zeros when no word of the
                question is present in embeddings
    """
    known_vectors = [
        embeddings[token] for token in question.split(' ') if token in embeddings
    ]

    total = np.zeros(dim, dtype=np.float32)
    if not known_vectors:
        return total

    # Accumulate one vector at a time into the float32 buffer.
    for vector in known_vectors:
        total += vector
    return total / len(known_vectors)
76+
77+
78+
def unpickle_file(filename):
    """Open ``filename`` in binary mode and return the object pickled in it."""
    with open(filename, 'rb') as source:
        payload = pickle.load(source)
    return payload

0 commit comments

Comments
 (0)