Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions machine_learning/feature_extraction/tfidf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import numpy as np

Check failure on line 1 in machine_learning/feature_extraction/tfidf.py

View workflow job for this annotation

GitHub Actions / ruff

ruff (INP001)

machine_learning/feature_extraction/tfidf.py:1:1: INP001 File `machine_learning/feature_extraction/tfidf.py` is part of an implicit namespace package. Add an `__init__.py`.
import re

Check failure on line 2 in machine_learning/feature_extraction/tfidf.py

View workflow job for this annotation

GitHub Actions / ruff

ruff (I001)

machine_learning/feature_extraction/tfidf.py:1:1: I001 Import block is un-sorted or un-formatted help: Organize imports
# Split text into words and normalize it


def decompose(text: str) -> list[str]:
    """Normalize *text* and split it into a list of word tokens.

    The text is lowercased, every character other than ``a-z``, ``0-9`` or
    whitespace is deleted (not replaced by a space), and the result is split
    on runs of whitespace.

    Args:
        text: The raw document to tokenize.

    Returns:
        The tokens in order of appearance; an empty list when nothing
        alphanumeric remains.

    >>> decompose("The cat, sat!")
    ['the', 'cat', 'sat']
    >>> decompose("  don't   STOP\\n")
    ['dont', 'stop']
    >>> decompose("")
    []
    """
    # Punctuation is removed outright, so "don't" becomes "dont".
    cleaned = re.sub(r"[^a-z0-9\s]", "", text.lower())
    # split() with no separator already collapses whitespace runs and drops
    # leading/trailing whitespace, so no separate collapsing pass is needed.
    return cleaned.split()


# creating tfidf class
class TfIdfVectorizer:
    """Minimal TF-IDF vectorizer over documents tokenized with ``decompose``.

    ``compute_tf`` learns the vocabulary and IDF weights from a corpus and
    returns its TF-IDF matrix; ``encode`` projects further documents onto the
    learned vocabulary.

    Term frequency is ``count / number_of_tokens_in_document`` and the IDF
    weight of a word is ``log((n + 1) / (1 + df)) + 1``, where ``n`` is the
    number of fitted documents and ``df`` the number of them containing the
    word.

    >>> vectorizer = TfIdfVectorizer()
    >>> matrix = vectorizer.compute_tf(["a b", "a"])
    >>> vectorizer.vocab
    ['a', 'b']
    >>> matrix.shape
    (2, 2)
    """

    def __init__(self):
        # Words in order of first appearance in the fitted corpus; None
        # until compute_tf() has been called.
        self.vocab = None
        # Mapping word -> IDF weight; None until compute_tf() has been called.
        self.idf = None

    @staticmethod
    def _term_frequencies(words, allowed=None):
        """Return ``{word: count / len(words)}`` for the tokens in *words*.

        When *allowed* is given, only tokens contained in it are counted, but
        the denominator is still the full token count of the document.
        """
        counts = {}
        for word in words:
            if allowed is None or word in allowed:
                counts[word] = counts.get(word, 0) + 1

        total = len(words)
        # An empty document yields empty counts, so no division happens.
        return {word: count / total for word, count in counts.items()}

    def compute_tf(self, data):
        """Fit the vocabulary and IDF weights on *data* and return its TF-IDF.

        Despite its name this method both fits the model (it sets
        ``self.vocab`` and ``self.idf``) and transforms the corpus.

        Args:
            data: Iterable of document strings.

        Returns:
            A float ``numpy`` array with one row per document and one column
            per vocabulary word, in ``self.vocab`` order. For an empty corpus
            the array is empty.
        """
        tf = [self._term_frequencies(decompose(document)) for document in data]

        # Document frequency is accumulated in one pass over the per-document
        # tables. Dicts keep insertion order, so the keys also give the
        # vocabulary in order of first appearance across the corpus.
        doc_freq = {}
        for doc_tf in tf:
            for word in doc_tf:
                doc_freq[word] = doc_freq.get(word, 0) + 1

        # len(tf) rather than len(data) so that *data* may be any iterable.
        n = len(tf)
        idf = {
            word: np.log((n + 1) / (1 + df)) + 1 for word, df in doc_freq.items()
        }
        vocab = list(idf)

        self.idf = idf
        self.vocab = vocab

        tfidf = [
            [doc_tf.get(word, 0) * idf[word] for word in vocab] for doc_tf in tf
        ]
        return np.array(tfidf, dtype=float)

    def encode(self, data):
        """Return the TF-IDF matrix of *data* using the fitted vocabulary.

        Words that are not in the fitted vocabulary get no column, but they
        still count towards the document length used to normalize term
        frequencies.

        Args:
            data: Iterable of document strings.

        Returns:
            A float ``numpy`` array with one row per document and one column
            per word of ``self.vocab``.

        Raises:
            ValueError: If ``compute_tf`` has not been called yet.
        """
        if self.vocab is None or self.idf is None:
            raise ValueError("You should fit the model first")

        vocab = self.vocab
        idf = self.idf
        # Build the lookup set once; testing membership against the vocab
        # list for every token would be linear in the vocabulary size.
        known = set(vocab)

        tfidf_matrix = []
        for doc in data:
            doc_tf = self._term_frequencies(decompose(doc), allowed=known)
            tfidf_matrix.append([doc_tf.get(word, 0) * idf[word] for word in vocab])

        return np.array(tfidf_matrix, dtype=float)


if __name__ == "__main__":
    # Demo: fit the vectorizer on a two-sentence corpus, then show the
    # learned vocabulary and the resulting TF-IDF matrix.
    vectorizer = TfIdfVectorizer()
    documents = [
        "the cat sat on the mat",
        "the dog chased the cat",
    ]
    tfidf_matrix = vectorizer.compute_tf(documents)

    print("Vocabulary:", vectorizer.vocab)
    print("TF-IDF Matrix:\n", tfidf_matrix)
Loading