enthus-appdev · Hinne1 · Jan 7, 2025 · Oct 26, 2024 · Jan 6, 2025 · Jan 6, 2025
diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,10 @@ wheels/
 
 # venv
 .venv
+
+# tests
+tests/*
+output.jpg
+
+# Mac
+.DS_Store
diff --git a/README.md b/README.md
@@ -1,9 +1,13 @@
+This application generates JPG previews of `.msg` and `.eml` email files.
+
 # Setup
 install rye (or any other python pkg manager of choice, you'll have to install the dependencies yourself)
 
-`rye sync`
+Install Rust (`brew install rust`, or see https://www.rust-lang.org/tools/install)
 
+`rye sync`
 
+# Starting the program (local)
 `python src/main.py`
 
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -10,6 +10,7 @@ dependencies = [
     "extract-msg>=0.50.0",
     "pillow>=10.4.0",
     "beautifulsoup4>=4.12.3",
+    "fast-mail-parser>=0.2.5",
 ]
 readme = "README.md"
 requires-python = ">= 3.8"

diff --git a/requirements-dev.lock b/requirements-dev.lock
@@ -31,6 +31,8 @@ ebcdic==1.1.1
     # via extract-msg
 extract-msg==0.50.0
     # via files-preview-python-api
+fast-mail-parser==0.2.5
+    # via files-preview-python-api
 flask==3.0.3
     # via files-preview-python-api
 itsdangerous==2.2.0

diff --git a/requirements.lock b/requirements.lock
@@ -31,6 +31,8 @@ ebcdic==1.1.1
     # via extract-msg
 extract-msg==0.50.0
     # via files-preview-python-api
+fast-mail-parser==0.2.5
+    # via files-preview-python-api
 flask==3.0.3
     # via files-preview-python-api
 itsdangerous==2.2.0

diff --git a/src/image_functions.py b/src/image_functions.py
@@ -0,0 +1,40 @@
+from PIL import Image, ImageDraw, ImageFont
+
+def _load_preview_font(size):
+    """Returns a font for the preview, falling back when Arial is not installed."""
+    # Arial is tried first so the rendering is unchanged wherever it is available.
+    # DejaVu Sans is tried next as a substitute with broad character coverage.
+    for font_name in ("Arial.ttf", "DejaVuSans.ttf"):
+        try:
+            return ImageFont.truetype(font_name, size)
+        except OSError:
+            # truetype raises OSError when the font file cannot be found or read.
+            continue
+
+    # Last resort: Pillow's built-in font. It may not cover every character, but
+    # a degraded preview is better than failing the whole conversion.
+    return ImageFont.load_default()
+
+def write_text_to_image(text, output_image_path):
+    """
+    Writes the given text to a 1920x1080 JPEG image.
+
+    Args:
+        text (str): The text to render, centered on a white background.
+        output_image_path (str): Path the JPEG file is written to.
+    """
+    image_width = 1920
+    image_height = 1080
+
+    # A TrueType font is preferred because the default font doesn't support all characters
+    font = _load_preview_font(20)
+
+    # Create the image
+    image = create_image((image_width, image_height), 'white', text, font, 'black')
+
+    # Save the image as .jpg
+    image.save(output_image_path, 'JPEG')
+
+    print(f"Image saved as: {output_image_path}")
+
+def create_image(size, bgColor, text, font, fontColor):
+    """
+    Renders a block of text centered on a solid-color canvas.
+
+    Args:
+        size (tuple): Canvas dimensions in pixels, given as (width, height).
+        bgColor (str): Fill color of the canvas; handed to Image.new unchanged.
+        text (str): The text to draw.
+        font (PIL.ImageFont): The font used for the text.
+        fontColor (str): Color of the text; handed to ImageDraw.text as fill.
+
+    Returns:
+        PIL.Image.Image: The newly created RGB image.
+    """
+    width, height = size
+    center = (width/2, height/2)
+
+    canvas = Image.new('RGB', size, bgColor)
+
+    # anchor="mm" puts the middle of the text on the given point, here the canvas center.
+    ImageDraw.Draw(canvas).text(center, text, anchor="mm", font=font, fill=fontColor)
+
+    return canvas
diff --git a/src/main.py b/src/main.py
@@ -1,74 +1,12 @@
 from flask import Flask, request, send_file, jsonify
+from fast_mail_parser import parse_email, ParseError
 import extract_msg
-from PIL import Image, ImageDraw, ImageFont
-from bs4 import BeautifulSoup
-from email import policy
-from email.parser import BytesParser
+import text_functions
+import image_functions
 import os
-import textwrap
 
 app = Flask(__name__)
 
-def convert_html_to_text(html_content):
-    """Convert HTML content to plain text using BeautifulSoup."""
-    soup = BeautifulSoup(html_content, 'html.parser')
-    return soup.get_text()
-
-def extract_email_body_from_msg(msg_file_path):
-    """Extract body content from a .msg file."""
-    # TODO might have to handle HTML content as well.
-    msg = extract_msg.Message(msg_file_path)
-
-    return msg.body
-
-def extract_email_body_from_eml(eml_file_path):
-    """Extract body content from a .eml file."""
-    with open(eml_file_path, 'rb') as f:
-        msg = BytesParser(policy=policy.default).parse(f)
-
-    # Check for HTML or plain text part
-    if msg.is_multipart():
-        for part in msg.iter_parts():
-            if part.get_content_type() == 'text/html':
-                return convert_html_to_text(part.get_payload(decode=True).decode())
-            elif part.get_content_type() == 'text/plain':
-                return part.get_payload(decode=True).decode()
-    else:
-        # Non-multipart email, directly return text/plain or HTML content
-        if msg.get_content_type() == 'text/html':
-            return convert_html_to_text(msg.get_payload(decode=True).decode())
-        else:
-            return msg.get_payload(decode=True).decode()
-
-def convert_email_to_image(body_content, output_image_path):
-    """Convert email body content to an image."""
-    # Set up image
-    ## TODO probably need to adjust these values based on the content
-    image_width = 800
-    image_height = 600
-    padding = 20
-    font = ImageFont.load_default()
-
-    # Text wrapping for proper formatting
-    wrapped_text = textwrap.fill(body_content, width=100)
-
-    # TODO: Calculate the height of the image based on the wrapped text and center text
-    dummy_image = Image.new('RGB', (image_width, 1), color=(255, 255, 255))
-    draw = ImageDraw.Draw(dummy_image)
-    # _, _, _, image_height = draw.multiline_textbbox((0, 0), text=wrapped_text, font=font)
-    print(image_width, image_height)
-
-    # Create the actual image
-    image = Image.new('RGB', (image_width, image_height), color=(255, 255, 255))
-    draw = ImageDraw.Draw(image)
-
-    # Draw the text on the image
-    draw.text((padding, padding), wrapped_text, font=font, fill=(0, 0, 0))
-
-    # Save the image as .jpg
-    image.save(output_image_path, 'JPEG')
-    print(f"Image saved as: {output_image_path}")
-
 @app.route('/converter', methods=['POST'])
 def convert_email():
     if 'file' not in request.files:
@@ -79,28 +17,51 @@ def convert_email():
     if file.filename == '':
         return jsonify({"error": "No selected file"}), 400
 
-    # TODO might have to check file by header instead of extension.
-    if file and (file.filename.endswith('.msg') or file.filename.endswith('.eml')):
-        filename = file.filename
-        file_path = os.path.join('/tmp', filename)
-        file.save(file_path)
+    if file is None:
+        return jsonify({"error": "No valid file provided"}), 400
 
-        # Extract body content based on file type
-        if filename.endswith('.msg'):
-            body_content = extract_email_body_from_msg(file_path)
-        elif filename.endswith('.eml'):
-            body_content = extract_email_body_from_eml(file_path)
-        else:
-            return jsonify({"error": "Unsupported file format"}), 400
+    # Save the file to /tmp
+    filename = file.filename
+    file_path = os.path.join('/tmp', filename)
+    file.save(file_path)
 
-        # Convert email body to image
-        output_image_path = os.path.join('/tmp', 'output.jpg')
-        convert_email_to_image(body_content, output_image_path)
+    # Check file by content
+    # Check if it's a .msg file.
+    try:
+        extract_msg.openMsg(file_path)
+        is_msg = True
+    except:
+        is_msg = False
+        pass
 
-        # Return the image file as a response
-        return send_file(output_image_path, mimetype='image/jpeg')
 
-    return jsonify({"error": "Unsupported file format"}), 400
+    # Check if it's a .eml file.
+    try:
+        with open(file_path, 'r') as f:
+            message_payload = f.read()
+
+        _ = parse_email(message_payload)
+        is_eml = True
+
+    # UnicodeDecodeError is raised when the file is not a text file (e.g. an .msg file)
+    except (ParseError, UnicodeDecodeError): 
+        is_eml = False
+        pass
+
+    # Extract text from the file
+    if is_msg:
+        text = text_functions.build_email_text_from_msg(file_path)
+    elif is_eml:
+        text = text_functions.build_email_text_from_eml(file_path)
+    else:
+        return jsonify({"error": "File has no supported file type: eml, msg"}), 400
+
+    # Write extracted text to image
+    output_image_path = os.path.join('/tmp', 'output.jpg')
+    image_functions.write_text_to_image(text, output_image_path)
+
+    # Return the image file as a response
+    return send_file(output_image_path, mimetype='image/jpeg')
 
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=8082) # TODO change port number via params
diff --git a/src/text_functions.py b/src/text_functions.py
@@ -0,0 +1,84 @@
+from fast_mail_parser import parse_email
+from bs4 import BeautifulSoup
+import extract_msg
+import textwrap
+import re
+
+def convert_html_to_text(html_content):
+    """Returns the text of an HTML string with all markup removed (via BeautifulSoup)."""
+    # 'html.parser' selects Python's built-in HTML parser as the backend.
+    return BeautifulSoup(html_content, 'html.parser').get_text()
+
+def build_email_text_from_msg(msg_file_path):
+    """
+    Extracts the content of a .msg file and formats it as preview text.
+
+    The result lists date, sender, recipients and subject, followed by the
+    first 1000 characters of the wrapped body and, if there are any, the
+    attachment filenames. Fields missing from the message are left empty.
+
+    Args:
+        msg_file_path (str): Path to the .msg file.
+
+    Returns:
+        str: The formatted text.
+    """
+
+    msg = extract_msg.openMsg(msg_file_path)
+    try:
+        # Strip markup from the body only. Running the HTML conversion over the
+        # finished text would parse "<user@example.com>" in the sender and
+        # recipient lines as a tag and silently drop the address.
+        # The wrapped body is kept in a local instead of being written back
+        # onto the message object.
+        body = wrap_text(convert_html_to_text(msg.body or ''), 160)
+
+        date = msg.date.strftime("%d.%m.%Y") if msg.date else ''
+
+        # Build text from the content of the email
+        text = (
+            'Datum: ' + date
+            + '\nVon: ' + (msg.sender or '')
+            + '\nAn: ' + (msg.to or '')
+            + '\nBetreff: ' + (msg.subject or '')
+            + '\n\nNachricht:\n' + body[:1000]
+        )
+
+        # Collect attachment filenames, one per line
+        attachmentFilenames = ''.join(
+            (attachment.getFilename() or '') + '\n' for attachment in msg.attachments
+        )
+    finally:
+        # Close the message so the underlying file is released.
+        msg.close()
+
+    if attachmentFilenames:
+        text += '\n\nAnhänge:\n' + attachmentFilenames
+
+    # Remove some special characters that don't get displayed correctly
+    text = text.replace('\r', '').replace('\t', '')
+
+    # Remove duplicate spaces
+    text = re.sub(' {2,}', ' ', text)
+
+    return text
+
+def build_email_text_from_eml(eml_file_path):
+    """
+    Extracts the content of a .eml file and formats it as preview text.
+
+    The result lists date and subject, followed by the first 1000 characters
+    of the wrapped body and, if there are any, the attachment filenames.
+
+    Args:
+        eml_file_path (str): Path to the .eml file.
+
+    Returns:
+        str: The formatted text.
+    """
+
+    with open(eml_file_path, 'r') as f:
+        message_payload = f.read()
+
+    msg = parse_email(message_payload)
+
+    # Prefer the plain-text part. HTML-only mails have none, so fall back to
+    # the first HTML part instead of raising IndexError on an empty list.
+    if msg.text_plain:
+        raw_body = msg.text_plain[0]
+    elif msg.text_html:
+        raw_body = msg.text_html[0]
+    else:
+        raw_body = ''
+
+    # Strip markup from the body only, so that angle-bracketed text in the
+    # header lines is not mistaken for a tag and removed.
+    body = wrap_text(convert_html_to_text(raw_body), 160)
+
+    # Build text from the content of the email
+    text = 'Datum: ' + msg.date + '\nBetreff: ' + msg.subject + '\n\nNachricht:\n' + body[:1000]
+
+    # Collect attachment filenames, one per line
+    attachmentFilenames = ''.join(
+        (attachment.filename or '') + '\n' for attachment in msg.attachments
+    )
+    if attachmentFilenames:
+        text += '\n\nAnhänge:\n' + attachmentFilenames
+
+    # Remove some special characters that don't get displayed correctly
+    text = text.replace('\r', '').replace('\t', '')
+
+    # Remove duplicate spaces
+    text = re.sub(' {2,}', ' ', text)
+
+    return text
+
+def wrap_text(text, width):
+    """
+    Wraps the given text to the specified width.
+
+    Existing single line breaks are preserved; blank (whitespace-only) lines
+    are dropped. Words longer than the width are not broken.
+
+    Args:
+        text (str): The text to be wrapped. None or an empty string yields ''.
+        width (int): The maximum width of each line.
+
+    Returns:
+        str: The wrapped text.
+
+    """
+
+    # A message without a body may hand in None; treat it like empty text.
+    if not text:
+        return ''
+
+    wrapped_lines = []
+    for line in text.splitlines(keepends=True):
+        if line.strip() == '':
+            continue
+        # Each source line is wrapped on its own so its line break survives.
+        wrapped_lines.extend(textwrap.wrap(line, width,
+                break_long_words=False, replace_whitespace=False))
+
+    return '\n'.join(wrapped_lines)
diff --git a/tests/Kunde enthus TEST GmbH hat eine Bestellung über enthus connect getätigt(1).msg b/tests/Kunde enthus TEST GmbH hat eine Bestellung über enthus connect getätigt(1).msg
diff --git a/...uelle Freigabe notwendig_ Kunde MCL TEST GmbH hat eine Bestellung über myMCL getätigt.msg b/...uelle Freigabe notwendig_ Kunde MCL TEST GmbH hat eine Bestellung über myMCL getätigt.msg
diff --git a/tests/asd.msg b/tests/asd.msg
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,3 +8,10 @@ wheels/ @@
     # venv
     .venv
+    # tests
+    tests/*
+    output.jpg
+    # Mac
+    .DS_Store