windup · midays · Sep 19, 2023
diff --git a/src/utils/ocr.py b/src/utils/ocr.py
@@ -2,6 +2,7 @@
 
 import cv2
 import numpy as np
+import pandas as pd
 import pyautogui
 import pytesseract
 
@@ -19,10 +20,26 @@ def find_all_string_occurrences(string):
     # Preprocess the image
     processed_img = preprocess_image(screenshot_bgr)
     # Use pytesseract to extract text from the screenshot
-    extracted_text = pytesseract.image_to_string(processed_img, config="--psm 6 -l eng")
-
-    # Use a regular expression to find all occurrences of the substring
-    return re.findall(re.escape(string), extracted_text)
+    extracted_data = pytesseract.image_to_data(processed_img, config="--psm 6 -l eng",
+                                               output_type=pytesseract.Output.DATAFRAME)
+
+    occurrences = []
+    locations = []
+
+    for i, row in extracted_data.iterrows():
+        if pd.isna(row['text']):
+            continue
+        found_positions = [m.start() for m in re.finditer(re.escape(string), row['text'])]
+        for pos in found_positions:
+            occurrences.append(string)
+            locations.append({
+                'left': row['left'] + pos,
+                'top': row['top'],
+                'width': row['width'],
+                'height': row['height']
+            })
+
+    return occurrences, locations
 
 
 def find_all_sentence_occurrences(sentence):