Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,10 @@ wheels/

# venv
.venv

# tests
tests/*
output.jpg

# Mac
.DS_Store
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
This application generates jpg previews of .msg and .eml email files.

# Setup
Install rye (or any other Python package manager of your choice; in that case you will have to install the dependencies yourself).

`rye sync`
Install Rust (`brew install rust` or https://www.rust-lang.org/tools/install).

`rye sync`

# Starting the program (local)
`python src/main.py`


Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ dependencies = [
"extract-msg>=0.50.0",
"pillow>=10.4.0",
"beautifulsoup4>=4.12.3",
"fast-mail-parser>=0.2.5",
]
readme = "README.md"
requires-python = ">= 3.8"
Expand Down
2 changes: 2 additions & 0 deletions requirements-dev.lock
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ ebcdic==1.1.1
# via extract-msg
extract-msg==0.50.0
# via files-preview-python-api
fast-mail-parser==0.2.5
# via files-preview-python-api
flask==3.0.3
# via files-preview-python-api
itsdangerous==2.2.0
Expand Down
2 changes: 2 additions & 0 deletions requirements.lock
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ ebcdic==1.1.1
# via extract-msg
extract-msg==0.50.0
# via files-preview-python-api
fast-mail-parser==0.2.5
# via files-preview-python-api
flask==3.0.3
# via files-preview-python-api
itsdangerous==2.2.0
Expand Down
40 changes: 40 additions & 0 deletions src/image_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from PIL import Image, ImageDraw, ImageFont

def write_text_to_image(text, output_image_path):
    """Writes given text to a white 1920x1080 image and saves it as JPEG.

    Args:
        text (str): The text to draw onto the image.
        output_image_path (str): Path the JPEG file is written to.
    """
    image_width = 1920
    image_height = 1080
    font_size = 20

    # We need a TrueType font because the legacy default font doesn't support
    # all characters. Arial is not installed everywhere, and
    # ImageFont.truetype raises OSError when a font cannot be opened, so try a
    # second common font before giving up on named fonts.
    font = None
    for font_name in ("Arial.ttf", "DejaVuSans.ttf"):
        try:
            font = ImageFont.truetype(font_name, font_size)
            break
        except OSError:
            continue
    if font is None:
        # Last resort: Pillow's built-in default font at the requested size.
        font = ImageFont.load_default(size=font_size)

    # Create the image
    image = create_image((image_width, image_height), 'white', text, font, 'black')

    # Save the image as .jpg
    image.save(output_image_path, 'JPEG')

    print(f"Image saved as: {output_image_path}")

def create_image(size, bgColor, text, font, fontColor):
    """
    Builds an RGB image filled with a background color and draws text on it.

    Args:
        size (tuple): Image dimensions in pixels as (width, height).
        bgColor (str): Fill color of the image, passed to Image.new.
        text (str): The text to draw.
        font (PIL.ImageFont): Font used to render the text.
        fontColor (str): Fill color of the text.

    Returns:
        PIL.Image.Image: The image with the text drawn on it.
    """
    width, height = size
    canvas = Image.new('RGB', size, bgColor)

    # The "mm" anchor places the middle of the text at the given point,
    # which is the center of the canvas.
    center = (width/2, height/2)
    ImageDraw.Draw(canvas).text(center, text, anchor="mm", font=font, fill=fontColor)

    return canvas
127 changes: 44 additions & 83 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,74 +1,12 @@
from flask import Flask, request, send_file, jsonify
from fast_mail_parser import parse_email, ParseError
import extract_msg
from PIL import Image, ImageDraw, ImageFont
from bs4 import BeautifulSoup
from email import policy
from email.parser import BytesParser
import text_functions
import image_functions
import os
import textwrap

app = Flask(__name__)

def convert_html_to_text(html_content):
    """Return the text of an HTML string, parsed with BeautifulSoup's html.parser."""
    return BeautifulSoup(html_content, 'html.parser').get_text()

def extract_email_body_from_msg(msg_file_path):
    """Extract body content from a .msg file.

    Args:
        msg_file_path (str): Path of the .msg file to read.

    Returns:
        The value of the message's ``body`` attribute.
    """
    # TODO might have to handle HTML content as well.
    msg = extract_msg.Message(msg_file_path)
    try:
        return msg.body
    finally:
        # Release the underlying file handle even if reading the body fails.
        msg.close()

def extract_email_body_from_eml(eml_file_path):
    """Extract body content from a .eml file.

    The plain-text body is preferred; an HTML body is used (converted to plain
    text) only when no plain-text body exists.

    Args:
        eml_file_path (str): Path of the .eml file to read.

    Returns:
        str: The body text, or an empty string when the message has no text body.
    """
    with open(eml_file_path, 'rb') as f:
        msg = BytesParser(policy=policy.default).parse(f)

    # get_body() also searches nested multipart containers, which a single
    # pass over the top-level parts would miss.
    body_part = msg.get_body(preferencelist=('plain', 'html'))
    if body_part is None:
        return ''

    # get_content() decodes the payload using the charset declared by the part
    # instead of assuming UTF-8.
    content = body_part.get_content()
    if body_part.get_content_type() == 'text/html':
        return convert_html_to_text(content)
    return content

def convert_email_to_image(body_content, output_image_path):
    """Convert email body content to an image.

    The body is wrapped to 100 characters per line and drawn in black on a
    white 800x600 image, which is saved as JPEG.

    Args:
        body_content (str): The email body; ``None`` is treated as empty text.
        output_image_path (str): Path the JPEG file is written to.
    """
    # TODO probably need to adjust these values based on the content
    image_width = 800
    image_height = 600
    padding = 20
    font = ImageFont.load_default()

    # Text wrapping for proper formatting
    wrapped_text = textwrap.fill(body_content or '', width=100)

    # TODO: Calculate the height of the image based on the wrapped text and center text
    image = Image.new('RGB', (image_width, image_height), color=(255, 255, 255))
    draw = ImageDraw.Draw(image)

    # Draw the text on the image, offset from the top-left corner by the padding
    draw.text((padding, padding), wrapped_text, font=font, fill=(0, 0, 0))

    # Save the image as .jpg
    image.save(output_image_path, 'JPEG')
    print(f"Image saved as: {output_image_path}")

@app.route('/converter', methods=['POST'])
def convert_email():
if 'file' not in request.files:
Expand All @@ -79,28 +17,51 @@ def convert_email():
if file.filename == '':
return jsonify({"error": "No selected file"}), 400

# TODO might have to check file by header instead of extension.
if file and (file.filename.endswith('.msg') or file.filename.endswith('.eml')):
filename = file.filename
file_path = os.path.join('/tmp', filename)
file.save(file_path)
if file is None:
return jsonify({"error": "No valid file provided"}), 400

# Extract body content based on file type
if filename.endswith('.msg'):
body_content = extract_email_body_from_msg(file_path)
elif filename.endswith('.eml'):
body_content = extract_email_body_from_eml(file_path)
else:
return jsonify({"error": "Unsupported file format"}), 400
# Save the file to /tmp
filename = file.filename
file_path = os.path.join('/tmp', filename)
file.save(file_path)

# Convert email body to image
output_image_path = os.path.join('/tmp', 'output.jpg')
convert_email_to_image(body_content, output_image_path)
# Check file by content
# Check if it's a .msg file.
try:
extract_msg.openMsg(file_path)
is_msg = True
except:
is_msg = False
pass

# Return the image file as a response
return send_file(output_image_path, mimetype='image/jpeg')

return jsonify({"error": "Unsupported file format"}), 400
# Check if it's a .eml file.
try:
with open(file_path, 'r') as f:
message_payload = f.read()

_ = parse_email(message_payload)
is_eml = True

# UnicodeDecodeError is raised when the file is not a text file (e.g. an .msg file)
except (ParseError, UnicodeDecodeError):
is_eml = False
pass

# Extract text from the file
if is_msg:
text = text_functions.build_email_text_from_msg(file_path)
elif is_eml:
text = text_functions.build_email_text_from_eml(file_path)
else:
return jsonify({"error": "File has no supported file type: eml, msg"}), 400

# Write extracted text to image
output_image_path = os.path.join('/tmp', 'output.jpg')
image_functions.write_text_to_image(text, output_image_path)

# Return the image file as a response
return send_file(output_image_path, mimetype='image/jpeg')

if __name__ == '__main__':
app.run(host='0.0.0.0', port=8082) # TODO change port number via params
84 changes: 84 additions & 0 deletions src/text_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from fast_mail_parser import parse_email
from bs4 import BeautifulSoup
import extract_msg
import textwrap
import re

def convert_html_to_text(html_content):
    """Returns the text of an HTML string, parsed with BeautifulSoup's html.parser."""
    return BeautifulSoup(html_content, 'html.parser').get_text()

def build_email_text_from_msg(msg_file_path):
    """Extracts content from a .msg file and formats the resulting text.

    The text consists of date, sender, recipients and subject lines, followed
    by the first 1000 characters of the wrapped message body and, if present,
    a list of attachment filenames. Missing fields are rendered as empty.

    Args:
        msg_file_path (str): Path of the .msg file to read.

    Returns:
        str: The formatted text.
    """
    # The context manager closes the file once all fields have been read.
    with extract_msg.openMsg(msg_file_path) as msg:
        date = msg.date.strftime("%d.%m.%Y") if msg.date else ''
        sender = msg.sender or ''
        recipients = msg.to or ''
        subject = msg.subject or ''
        raw_body = msg.body or ''
        attachment_names = [attachment.getFilename() for attachment in msg.attachments]

    # Strip markup from the body only: running the HTML parser over the header
    # lines would treat "<user@example.com>" as a tag and drop the address.
    body = wrap_text(convert_html_to_text(raw_body), 160)

    # Build text from the content of the email
    text = (
        f'Datum: {date}\nVon: {sender}\nAn: {recipients}\nBetreff: {subject}'
        f'\n\nNachricht:\n{body[:1000]}'
    )

    # Add attachment filenames to the email text
    attachment_listing = ''.join(f'{name}\n' for name in attachment_names if name)
    if attachment_listing:
        text += '\n\nAnhänge:\n' + attachment_listing

    # Remove some special characters that don't get displayed correctly
    text = text.replace('\r', '').replace('\t', '')

    # Remove duplicate spaces
    text = re.sub(' {2,}', ' ', text)

    return text

def build_email_text_from_eml(eml_file_path):
    """Extracts content from a .eml file and formats the resulting text.

    The text consists of date and subject lines, followed by the first 1000
    characters of the wrapped message body and, if present, a list of
    attachment filenames. The first plain-text part is used as body; when the
    mail has none, the first HTML part is used instead.

    Args:
        eml_file_path (str): Path of the .eml file to read.

    Returns:
        str: The formatted text.
    """
    with open(eml_file_path, 'r') as f:
        message_payload = f.read()

    msg = parse_email(message_payload)

    # HTML-only mails have an empty text_plain list; indexing it would raise.
    if msg.text_plain:
        raw_body = msg.text_plain[0]
    elif msg.text_html:
        raw_body = msg.text_html[0]
    else:
        raw_body = ''

    # Strip markup from the body only, so that angle-bracketed text in the
    # header lines is not mistaken for HTML tags and dropped.
    body = wrap_text(convert_html_to_text(raw_body), 160)

    # Build text from the content of the email
    date = msg.date or ''
    subject = msg.subject or ''
    text = f'Datum: {date}\nBetreff: {subject}\n\nNachricht:\n{body[:1000]}'

    # Add attachment filenames to the email text
    attachment_listing = ''.join(
        f'{attachment.filename}\n' for attachment in msg.attachments if attachment.filename
    )
    if attachment_listing:
        text += '\n\nAnhänge:\n' + attachment_listing

    # Remove some special characters that don't get displayed correctly
    text = text.replace('\r', '').replace('\t', '')

    # Remove duplicate spaces
    text = re.sub(' {2,}', ' ', text)

    return text

def wrap_text(text, width):
    """
    Wraps the given text to the specified width. Preserves existing single line breaks.

    Blank lines are dropped, and words longer than the width are kept intact
    on a line of their own rather than being split.

    Args:
        text (str): The text to be wrapped. None or an empty string yields ''.
        width (int): The maximum width of each line.

    Returns:
        str: The wrapped text.

    """
    # A message without a body arrives here as None; treat it as empty text.
    if not text:
        return ''

    wrapped_lines = []
    for line in text.splitlines(keepends=True):
        # Skip blank lines so consecutive line breaks collapse into one.
        if line.strip() == '':
            continue
        wrapped_lines.extend(
            textwrap.wrap(line, width, break_long_words=False, replace_whitespace=False)
        )

    return '\n'.join(wrapped_lines)
Binary file not shown.
Binary file not shown.
Binary file removed tests/asd.msg
Binary file not shown.
Loading