"""
TF-IDF feature extraction implemented with plain Python and NumPy.

Each document is tokenised by ``decompose`` and turned into a row vector
whose columns follow the vocabulary learned from the fitting corpus:

    tf(t, d)  = (occurrences of t in d) / (number of tokens in d)
    idf(t)    = ln((n_docs + 1) / (df(t) + 1)) + 1
    tfidf     = tf(t, d) * idf(t)

where ``df(t)`` is the number of fitting documents that contain ``t``.
"""

# Provenance: this module is the end state of a three-part patch series
# (commits fdaf4bf, f88e0ac and eed9873) that added
# machine_learning/feature_extraction/tf-idf.py, auto-formatted it, and
# renamed it to machine_learning/feature_extraction/tfidf.py.

import re
from collections import Counter
from collections.abc import Iterable

import numpy as np

# Anything that is not an ASCII lowercase letter, a digit or whitespace is
# removed (not replaced by a space), so "don't" becomes the token "dont" and
# non-ASCII letters are dropped from the token they appear in.
_NON_TOKEN_CHARS = re.compile(r"[^a-z0-9\s]")


def decompose(text: str) -> list[str]:
    """
    Lowercase ``text``, strip punctuation and split it into word tokens.

    Args:
        text: Raw document text.

    Returns:
        The tokens in their original order; an empty list when ``text``
        contains no letters or digits.

    >>> decompose("The Cat, the HAT!")
    ['the', 'cat', 'the', 'hat']
    >>> decompose("   ")
    []
    """
    # str.split() with no separator already collapses whitespace runs, so no
    # separate whitespace-normalisation pass is required.
    return _NON_TOKEN_CHARS.sub("", text.lower()).split()


class TfIdfVectorizer:
    """
    Learn a vocabulary and IDF weights from a corpus and encode documents.

    Attributes set by ``compute_tf`` (both are ``None`` until then):
        vocab: Vocabulary terms in first-seen order; column ``j`` of every
            returned matrix corresponds to ``vocab[j]``.
        idf: Mapping from each vocabulary term to its IDF weight.

    >>> vectorizer = TfIdfVectorizer()
    >>> matrix = vectorizer.compute_tf(["the cat sat", "the dog"])
    >>> vectorizer.vocab
    ['the', 'cat', 'sat', 'dog']
    >>> matrix.shape
    (2, 4)
    >>> vectorizer.encode(["the bird"]).tolist()
    [[0.5, 0.0, 0.0, 0.0]]
    """

    def __init__(self) -> None:
        self.vocab: list[str] | None = None
        self.idf: dict[str, float] | None = None

    @staticmethod
    def _as_matrix(rows: list[list[float]], n_terms: int) -> np.ndarray:
        """Stack ``rows`` into a float array of shape (len(rows), n_terms)."""
        # The explicit reshape keeps the result two-dimensional even when
        # there are no rows at all (np.array([]) alone would be 1-D).
        return np.array(rows, dtype=float).reshape(len(rows), n_terms)

    def compute_tf(self, data: Iterable[str]) -> np.ndarray:
        """
        Fit the vocabulary and IDF weights on ``data`` and return its TF-IDF.

        Despite the name, this method both fits the model (it overwrites
        ``self.vocab`` and ``self.idf``) and returns the full TF-IDF matrix
        of the fitting corpus, not just the term frequencies.

        Args:
            data: Iterable of document strings. It is consumed exactly once,
                so generators are accepted.

        Returns:
            Array of shape (n_documents, n_vocabulary_terms).
        """
        documents = list(data)

        term_freqs: list[dict[str, float]] = []
        # Insertion order of this Counter is the first-seen order of the
        # terms, so it doubles as the (ordered, de-duplicated) vocabulary.
        doc_freq: Counter[str] = Counter()

        for document in documents:
            words = decompose(document)
            counts = Counter(words)
            for word in counts:
                doc_freq[word] += 1
            # An empty document yields an empty Counter, so len(words) == 0
            # never reaches the division.
            term_freqs.append(
                {word: count / len(words) for word, count in counts.items()}
            )

        n_docs = len(documents)
        idf = {
            word: np.log((n_docs + 1) / (1 + df)) + 1 for word, df in doc_freq.items()
        }
        vocab = list(doc_freq)

        self.vocab = vocab
        self.idf = idf

        rows = [[tf.get(word, 0) * idf[word] for word in vocab] for tf in term_freqs]
        return self._as_matrix(rows, len(vocab))

    def encode(self, data: Iterable[str]) -> np.ndarray:
        """
        Encode ``data`` with the vocabulary and IDF weights already learned.

        Words that are not in the vocabulary contribute no column, but they
        still count towards the document length used to normalise the term
        frequencies.

        Args:
            data: Iterable of document strings.

        Returns:
            Array of shape (n_documents, n_vocabulary_terms).

        Raises:
            ValueError: If ``compute_tf`` has not been called yet.
        """
        if self.vocab is None or self.idf is None:
            raise ValueError("You should fit the model first")

        rows = []
        for document in data:
            words = decompose(document)
            counts = Counter(words)
            # Guard the division for empty documents; every count is 0 then,
            # so the row is all zeros regardless of the divisor.
            total = len(words) or 1
            rows.append(
                [counts.get(word, 0) / total * self.idf[word] for word in self.vocab]
            )

        return self._as_matrix(rows, len(self.vocab))


if __name__ == "__main__":
    documents = ["the cat sat on the mat", "the dog chased the cat"]
    vectorizer = TfIdfVectorizer()
    tfidf_matrix = vectorizer.compute_tf(documents)
    print("Vocabulary:", vectorizer.vocab)
    print("TF-IDF Matrix:\n", tfidf_matrix)