diff --git a/.gitignore b/.gitignore index ae8554d..f11bdf6 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,10 @@ wheels/ # venv .venv + +# tests +tests/* +output.jpg + +# Mac +.DS_Store diff --git a/README.md b/README.md index 69eaa6d..0cb9db5 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,13 @@ +This application generates jpg previews of .msg and .eml email files. + # Setup install rye (or any other python pkg manager of choice, you'll have to install the dependencies yourself) -`rye sync` +install rust (`brew install rust` or https://www.rust-lang.org/tools/install) +`rye sync` +# Starting the program (local) `python src/main.py` diff --git a/pyproject.toml b/pyproject.toml index cf8da2b..b1d9158 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ dependencies = [ "extract-msg>=0.50.0", "pillow>=10.4.0", "beautifulsoup4>=4.12.3", + "fast-mail-parser>=0.2.5", ] readme = "README.md" requires-python = ">= 3.8" diff --git a/requirements-dev.lock b/requirements-dev.lock index f64f65b..05870c4 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -31,6 +31,8 @@ ebcdic==1.1.1 # via extract-msg extract-msg==0.50.0 # via files-preview-python-api +fast-mail-parser==0.2.5 + # via files-preview-python-api flask==3.0.3 # via files-preview-python-api itsdangerous==2.2.0 diff --git a/requirements.lock b/requirements.lock index f64f65b..05870c4 100644 --- a/requirements.lock +++ b/requirements.lock @@ -31,6 +31,8 @@ ebcdic==1.1.1 # via extract-msg extract-msg==0.50.0 # via files-preview-python-api +fast-mail-parser==0.2.5 + # via files-preview-python-api flask==3.0.3 # via files-preview-python-api itsdangerous==2.2.0 diff --git a/src/image_functions.py b/src/image_functions.py new file mode 100644 index 0000000..c6b457f --- /dev/null +++ b/src/image_functions.py @@ -0,0 +1,40 @@ +from PIL import Image, ImageDraw, ImageFont + +def write_text_to_image(text, output_image_path): + """Writes given text to an image.""" + image_width = 1920 + 
image_height = 1080 + + # We need a hardcoded font because the default font doesn't support all characters + font = ImageFont.truetype("Arial.ttf", 20) + + # Create the image + image = create_image((image_width, image_height), 'white', text, font, 'black') + + # Save the image as .jpg + image.save(output_image_path, 'JPEG') + + print(f"Image saved as: {output_image_path}") + +def create_image(size, bgColor, text, font, fontColor): + """ + Creates an image with the specified size, background color, text, font, and font color. + + Args: + size (tuple): The size of the image in pixels, specified as a tuple (width, height). + bgColor (str): The background color of the image in RGB format. + text (str): The text to be displayed on the image. + font (PIL.ImageFont): The font to be used for the text. + fontColor (str): The color of the text in RGB format. + + Returns: + PIL.Image.Image: The created image. + """ + W, H = size + image = Image.new('RGB', size, bgColor) + draw = ImageDraw.Draw(image) + + # Draw the given text anchored to the center of the image, with the given font and color. 
+ draw.text((W/2, H/2), text, anchor="mm", font=font, fill=fontColor) + + return image \ No newline at end of file diff --git a/src/main.py b/src/main.py index 969a5c5..94e63a7 100644 --- a/src/main.py +++ b/src/main.py @@ -1,74 +1,12 @@ from flask import Flask, request, send_file, jsonify +from fast_mail_parser import parse_email, ParseError import extract_msg -from PIL import Image, ImageDraw, ImageFont -from bs4 import BeautifulSoup -from email import policy -from email.parser import BytesParser +import text_functions +import image_functions import os -import textwrap app = Flask(__name__) -def convert_html_to_text(html_content): - """Convert HTML content to plain text using BeautifulSoup.""" - soup = BeautifulSoup(html_content, 'html.parser') - return soup.get_text() - -def extract_email_body_from_msg(msg_file_path): - """Extract body content from a .msg file.""" - # TODO might have to handle HTML content as well. - msg = extract_msg.Message(msg_file_path) - - return msg.body - -def extract_email_body_from_eml(eml_file_path): - """Extract body content from a .eml file.""" - with open(eml_file_path, 'rb') as f: - msg = BytesParser(policy=policy.default).parse(f) - - # Check for HTML or plain text part - if msg.is_multipart(): - for part in msg.iter_parts(): - if part.get_content_type() == 'text/html': - return convert_html_to_text(part.get_payload(decode=True).decode()) - elif part.get_content_type() == 'text/plain': - return part.get_payload(decode=True).decode() - else: - # Non-multipart email, directly return text/plain or HTML content - if msg.get_content_type() == 'text/html': - return convert_html_to_text(msg.get_payload(decode=True).decode()) - else: - return msg.get_payload(decode=True).decode() - -def convert_email_to_image(body_content, output_image_path): - """Convert email body content to an image.""" - # Set up image - ## TODO probably need to adjust these values based on the content - image_width = 800 - image_height = 600 - padding = 20 - font = 
ImageFont.load_default() - - # Text wrapping for proper formatting - wrapped_text = textwrap.fill(body_content, width=100) - - # TODO: Calculate the height of the image based on the wrapped text and center text - dummy_image = Image.new('RGB', (image_width, 1), color=(255, 255, 255)) - draw = ImageDraw.Draw(dummy_image) - # _, _, _, image_height = draw.multiline_textbbox((0, 0), text=wrapped_text, font=font) - print(image_width, image_height) - - # Create the actual image - image = Image.new('RGB', (image_width, image_height), color=(255, 255, 255)) - draw = ImageDraw.Draw(image) - - # Draw the text on the image - draw.text((padding, padding), wrapped_text, font=font, fill=(0, 0, 0)) - - # Save the image as .jpg - image.save(output_image_path, 'JPEG') - print(f"Image saved as: {output_image_path}") - @app.route('/converter', methods=['POST']) def convert_email(): if 'file' not in request.files: @@ -79,28 +17,51 @@ def convert_email(): if file.filename == '': return jsonify({"error": "No selected file"}), 400 - # TODO might have to check file by header instead of extension. - if file and (file.filename.endswith('.msg') or file.filename.endswith('.eml')): - filename = file.filename - file_path = os.path.join('/tmp', filename) - file.save(file_path) + if file is None: + return jsonify({"error": "No valid file provided"}), 400 - # Extract body content based on file type - if filename.endswith('.msg'): - body_content = extract_email_body_from_msg(file_path) - elif filename.endswith('.eml'): - body_content = extract_email_body_from_eml(file_path) - else: - return jsonify({"error": "Unsupported file format"}), 400 + # Save the file to /tmp + filename = file.filename + file_path = os.path.join('/tmp', filename) + file.save(file_path) - # Convert email body to image - output_image_path = os.path.join('/tmp', 'output.jpg') - convert_email_to_image(body_content, output_image_path) + # Check file by content + # Check if it's a .msg file. 
+ try: + extract_msg.openMsg(file_path) + is_msg = True + except: + is_msg = False + pass - # Return the image file as a response - return send_file(output_image_path, mimetype='image/jpeg') - return jsonify({"error": "Unsupported file format"}), 400 + # Check if it's a .eml file. + try: + with open(file_path, 'r') as f: + message_payload = f.read() + + _ = parse_email(message_payload) + is_eml = True + + # UnicodeDecodeError is raised when the file is not a text file (e.g. an .msg file) + except (ParseError, UnicodeDecodeError): + is_eml = False + pass + + # Extract text from the file + if is_msg: + text = text_functions.build_email_text_from_msg(file_path) + elif is_eml: + text = text_functions.build_email_text_from_eml(file_path) + else: + return jsonify({"error": "File has no supported file type: eml, msg"}), 400 + + # Write extracted text to image + output_image_path = os.path.join('/tmp', 'output.jpg') + image_functions.write_text_to_image(text, output_image_path) + + # Return the image file as a response + return send_file(output_image_path, mimetype='image/jpeg') if __name__ == '__main__': app.run(host='0.0.0.0', port=8082) # TODO change port number via params diff --git a/src/text_functions.py b/src/text_functions.py new file mode 100644 index 0000000..eb52e08 --- /dev/null +++ b/src/text_functions.py @@ -0,0 +1,84 @@ +from fast_mail_parser import parse_email +from bs4 import BeautifulSoup +import extract_msg +import textwrap +import re + +def convert_html_to_text(html_content): + """Converts HTML content to plain text using BeautifulSoup.""" + soup = BeautifulSoup(html_content, 'html.parser') + return soup.get_text() + +def build_email_text_from_msg(msg_file_path): + """Extracts content from a .msg file and format the resulting text.""" + + msg = extract_msg.openMsg(msg_file_path) + + # Wrap message body + msg.body = wrap_text(msg.body, 160) + + # Build text from the content of the email + text = u'Datum: ' + msg.date.strftime("%d.%m.%Y") + '\nVon: ' + 
msg.sender + '\nAn: ' + msg.to + '\nBetreff: ' + msg.subject + '\n\nNachricht:\n' + msg.body[:1000] + + # Get Attachment Filenames add to email text + attachmentFilenames = '' + for attachment in msg.attachments: + attachmentFilenames += attachment.getFilename() + '\n' + if attachmentFilenames: + text += '\n\nAnhänge:\n' + attachmentFilenames + + # Remove some special characters that don't get displayed correctly + text = text.replace('\r', '').replace('\t', '') + + # Remove duplicate spaces + text = re.sub(' {2,}', ' ', text) + + return convert_html_to_text(text) + +def build_email_text_from_eml(eml_file_path): + """Extracts content from a .eml file and format the resulting text.""" + + with open(eml_file_path, 'r') as f: + message_payload = f.read() + + msg = parse_email(message_payload) + + # Wrap message body + body = wrap_text(msg.text_plain[0], 160) + + # Build text from the content of the email + text = u'Datum: ' + msg.date + '\nBetreff: ' + msg.subject + '\n\nNachricht:\n' + body[:1000] + + # Get Attachment Filenames add to email text + attachmentFilenames = '' + for attachment in msg.attachments: + attachmentFilenames += attachment.filename + '\n' + if attachmentFilenames: + text += '\n\nAnhänge:\n' + attachmentFilenames + + # Remove some special characters that don't get displayed correctly + text = text.replace('\r', '').replace('\t', '') + + # Remove duplicate spaces + text = re.sub(' {2,}', ' ', text) + + return convert_html_to_text(text) + +def wrap_text(text, width): + """ + Wraps the given text to the specified width. Preserves existing single line breaks. + + Args: + text (str): The text to be wrapped. + width (int): The maximum width of each line. + + Returns: + str: The wrapped text. 
+ + """ + + text = '\n'.join(['\n'.join(textwrap.wrap(line, width, + break_long_words=False, replace_whitespace=False)) + for line in text.splitlines(keepends=True) if line.strip() != '']) + + return text \ No newline at end of file diff --git "a/tests/Kunde enthus TEST GmbH hat eine Bestellung \303\274ber enthus connect get\303\244tigt(1).msg" "b/tests/Kunde enthus TEST GmbH hat eine Bestellung \303\274ber enthus connect get\303\244tigt(1).msg" deleted file mode 100644 index 7191e1f..0000000 Binary files "a/tests/Kunde enthus TEST GmbH hat eine Bestellung \303\274ber enthus connect get\303\244tigt(1).msg" and /dev/null differ diff --git "a/tests/WG_ _Manuelle Freigabe notwendig_ Kunde MCL TEST GmbH hat eine Bestellung \303\274ber myMCL get\303\244tigt.msg" "b/tests/WG_ _Manuelle Freigabe notwendig_ Kunde MCL TEST GmbH hat eine Bestellung \303\274ber myMCL get\303\244tigt.msg" deleted file mode 100644 index 9b8ad67..0000000 Binary files "a/tests/WG_ _Manuelle Freigabe notwendig_ Kunde MCL TEST GmbH hat eine Bestellung \303\274ber myMCL get\303\244tigt.msg" and /dev/null differ diff --git a/tests/asd.msg b/tests/asd.msg deleted file mode 100644 index 92ad9b0..0000000 Binary files a/tests/asd.msg and /dev/null differ