From 452f80f47ea55848247f669f70db70666f895f27 Mon Sep 17 00:00:00 2001
From: midays <midays@redhat.com>
Date: Tue, 19 Sep 2023 11:55:35 +0300
Subject: [PATCH] Update the OCR function so it also returns the locations
 of the strings found

---
 src/utils/ocr.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/utils/ocr.py b/src/utils/ocr.py
index 47a8e99..0228685 100644
--- a/src/utils/ocr.py
+++ b/src/utils/ocr.py
@@ -2,6 +2,7 @@
 
 import cv2
 import numpy as np
+import pandas as pd
 import pyautogui
 import pytesseract
 
@@ -19,10 +20,26 @@ def find_all_string_occurrences(string):
     # Preprocess the image
     processed_img = preprocess_image(screenshot_bgr)
     # Use pytesseract to extract text from the screenshot
-    extracted_text = pytesseract.image_to_string(processed_img, config="--psm 6 -l eng")
-
-    # Use a regular expression to find all occurrences of the substring
-    return re.findall(re.escape(string), extracted_text)
+    extracted_data = pytesseract.image_to_data(processed_img, config="--psm 6 -l eng",
+                                               output_type=pytesseract.Output.DATAFRAME)
+
+    occurrences = []
+    locations = []
+
+    for i, row in extracted_data.iterrows():
+        if pd.isna(row['text']):
+            continue
+        found_positions = [m.start() for m in re.finditer(re.escape(string), row['text'])]
+        for pos in found_positions:
+            occurrences.append(string)
+            locations.append({
+                'left': row['left'] + pos,
+                'top': row['top'],
+                'width': row['width'],
+                'height': row['height']
+            })
+
+    return occurrences, locations
 
 
 def find_all_sentence_occurrences(sentence):