Add maildir parser and tests

This commit is contained in:
Alex Selimov 2025-04-18 16:27:18 -04:00
parent 2c813c3242
commit df23e520fa
3 changed files with 188 additions and 13 deletions

View File

@ -1,10 +1,11 @@
import pathlib
from email.utils import parseaddr
from pathlib import Path

import pandas as pd
METADATA_SCHEMA = sorted(["Path", "From", "Date"])
def make_email_metadata(email_path: str) -> dict[str, str]:
def make_email_metadata(email_path: str | Path) -> dict[str, str]:
"""Make an email metadata object by parsing the email contents
Args:
@ -14,7 +15,7 @@ def make_email_metadata(email_path: str) -> dict[str, str]:
Returns: Dict containing the required metadata
"""
key_is_set = {key: False for key in METADATA_SCHEMA}
metadata = {"Path": email_path}
metadata = {"Path": str(email_path)}
key_is_set["Path"] = True
with open(email_path, "r") as f:
@ -35,7 +36,7 @@ def make_email_metadata(email_path: str) -> dict[str, str]:
return metadata
def parse_maildir(path_to_dir: str | pathlib.Path):
def parse_maildir(path_to_dir: str | Path):
"""Parse all of the emails within the specified maildir box (not recursively)
Args:
@ -44,7 +45,17 @@ def parse_maildir(path_to_dir: str | pathlib.Path):
Returns: MailDir object initialized with email information
"""
return MailDir(_)
file_list = Path(path_to_dir).glob("*")
email_metadata = [make_email_metadata(file) for file in file_list]
return MailDir(email_metadata)
class TopSender:
    """A sender's email address together with the display names seen for it.

    Attributes:
        email: The sender's email address.
        names: Display names associated with that address. The list is stored
            exactly as passed in; no limit on its length is applied here.
    """

    def __init__(self, email: str, names: list[str]):
        self.email = email
        self.names = names

    def __repr__(self) -> str:
        # Readable output when a list of senders shows up in a failed assert.
        return (
            f"{type(self).__name__}(email={self.email!r}, names={self.names!r})"
        )
class MailDir:
@ -52,5 +63,39 @@ class MailDir:
Stores the metadata associated with all local emails.
"""
# Derived columns: maps each column name to a function that computes that
# column from the raw metadata dataframe.
# "Name" and "Email" are split out of the raw "From" header with the standard
# library address parser rather than by splitting on "<", so a bare address
# without angle brackets yields that address (and an empty name) instead of
# raising IndexError.
KEYS_AND_FUNCS = {
    "Name": lambda df: df["From"].map(lambda x: parseaddr(x)[0]),
    "Email": lambda df: df["From"].map(lambda x: parseaddr(x)[1]),
    "Date": lambda df: pd.to_datetime(df["Date"]),
}
def __init__(self, email_metadata: list[dict[str, str]]):
    """Build the backing dataframe from per-email metadata dicts.

    Args:
        email_metadata: One dict per email. When non-empty, every derived
            column in KEYS_AND_FUNCS is computed from it and added to the
            dataframe. When empty, an empty dataframe with the raw and
            derived column names is created instead.
    """
    if email_metadata:
        self._df = pd.DataFrame(email_metadata)
        for k, func in self.KEYS_AND_FUNCS.items():
            self._df[k] = func(self._df)
    else:
        # "Date" is both a raw schema key and a derived column; drop the
        # repeat (keeping first-seen order) so the empty frame does not end
        # up with two "Date" columns.
        columns = list(
            dict.fromkeys(METADATA_SCHEMA + list(self.KEYS_AND_FUNCS))
        )
        self._df = pd.DataFrame(columns=columns)
def get_top_n_senders(self, n: int) -> list[TopSender]:
    """Return the n senders with the most emails, most frequent first.

    Each sender is reported with the unique display names found alongside
    their address.

    Args:
        n: Number of senders to retrieve

    Returns: list of TopSender objects; empty when n is not positive or
        when there are no emails
    """
    # A negative n would otherwise slice "all but the last |n|" senders.
    if n <= 0:
        return []

    # Count without sorting, then sort stably, so senders with equal counts
    # keep the order value_counts(sort=False) produced them in.
    # NOTE(review): that is order of first appearance in current pandas --
    # confirm for the pinned version.
    counts = (
        self._df["Email"]
        .value_counts(sort=False)
        .sort_values(ascending=False, kind="stable")
    )
    senders = [
        TopSender(
            email, list(self._df.loc[self._df["Email"] == email, "Name"].unique())
        )
        for email in counts.iloc[:n].index
    ]
    return senders

View File

@ -7,7 +7,7 @@ from tempfile import TemporaryDirectory
def tmp_dir():
tmp_dir = TemporaryDirectory(delete=False)
print("Created temporary directory ", tmp_dir.name)
yield tmp_dir.name
yield Path(tmp_dir.name)
print("Cleaned temporary directory", tmp_dir.name)
tmp_dir.cleanup()
@ -46,7 +46,7 @@ def test_email(tmp_dir):
X-Google-Smtp-Source: AGHT+IGQyWQO69p9mhCHt5N5NbKLfb9Ij9fgRFGjk+UJNpRo3S9VPDV6pXXucyU0xAL3AiT5jNtO16w=
X-Received: by 2002:a25:fb02:0:b0:664:f31a:2be0 with SMTP id u2-20020a25fb02000000b00664f31a2be0mr13538287ybg.36.1713373420812;
Wed, 16 Apr 2025 09:23:40 -0700 (PDT)
From: "John Doe" sender@example.com
From: "John Doe" <sender@example.com>
To: recipient@example.org
Subject: Sample email for parsing exercises
Date: Wed, 16 Apr 2025 12:23:35 -0400
@ -92,7 +92,46 @@ def test_email(tmp_dir):
Phone: (555) 123-4567
"""
email_path = Path(tmp_dir) / "test_email.txt"
email_path = tmp_dir / "test_email.txt"
with open(email_path, "w") as f:
f.write(test_contents)
yield email_path
return email_path
@pytest.fixture
def sample_email_dir(tmp_dir):
    """Write three minimal emails into tmp_dir and return the directory.

    Each email is stored in its own file named after its position in the
    list ("0", "1", "2").
    """
    # Only the headers needed for metadata are included; parsing of a full
    # email is covered by the single-email test.
    # The literal contents are kept flush-left so the written bytes are
    # unchanged.
    sample_emails = [
        """
From: Test <test@something.org>
Date: Wed, 10 Apr 2025 12:23:35 -0400
""",
        """
From: Not a Test <not_a_test@something.org>
Date: Wed, 16 Apr 2024 08:23:35 -0400
""",
        """
From: "Test2" <test@something.org>
Date: Wed, 11 Apr 2025 12:23:35 -0400
""",
    ]
    for index, contents in enumerate(sample_emails):
        (tmp_dir / f"{index}").write_text(contents)
    return tmp_dir
@pytest.fixture
def sample_email_metadata():
    """Return nine metadata dicts ("From" and "Date") from four addresses.

    Counts per address: john.doe 4, jane.smith 2, alex.johnson 2, sarah 1.
    """
    rows = (
        ("John Doe <john.doe@example.com>", "2025-01-01"),
        ("John Doe <john.doe@example.com>", "2025-01-02"),
        ("Johnny Doe <john.doe@example.com>", "2025-01-03"),
        ("J. Doe <john.doe@example.com>", "2025-01-04"),
        ("Jane Smith <jane.smith@example.com>", "2025-01-05"),
        ("Jane S. <jane.smith@example.com>", "2025-01-06"),
        ("Alex Johnson <alex.johnson@example.com>", "2025-01-07"),
        ("Alex J. <alex.johnson@example.com>", "2025-01-08"),
        ("Sarah Williams <sarah@example.com>", "2025-01-09"),
    )
    return [{"From": sender, "Date": date} for sender, date in rows]

View File

@ -1,11 +1,102 @@
import pandas as pd
from fixtures import *
from maildirclean.maildir import make_email_metadata
from maildirclean.maildir import make_email_metadata, MailDir, TopSender, parse_maildir
def test_email_parsing(test_email):
    """Parsing a single email file yields the expected From, Date and Path."""
    metadata = make_email_metadata(test_email)

    assert metadata["From"] == '"John Doe" <sender@example.com>'
    assert metadata["Date"] == "Wed, 16 Apr 2025 12:23:35 -0400"
    # The path is compared as a string.
    assert metadata["Path"] == str(test_email)
def test_maildir_creation(test_email):
    """A MailDir built from one parsed email exposes raw and derived columns."""
    parsed = make_email_metadata(test_email)
    row = MailDir([parsed])._df.iloc[0]

    # Raw header value plus the name/address split out of it.
    assert row["From"] == '"John Doe" <sender@example.com>'
    assert row["Name"] == "John Doe"
    assert row["Email"] == "sender@example.com"
    # The date column holds parsed timestamps, not the raw header string.
    assert row["Date"] == pd.to_datetime("Wed, 16 Apr 2025 12:23:35 -0400")
    assert row["Path"] == str(test_email)
def test_get_top_n_senders(sample_email_metadata):
    """The two most frequent senders come back in order with their names."""
    top_two = MailDir(sample_email_metadata).get_top_n_senders(2)
    assert len(top_two) == 2

    first, second = top_two

    # john.doe@example.com sent the most emails (4)
    assert first.email == "john.doe@example.com"
    assert set(first.names) == {"John Doe", "Johnny Doe", "J. Doe"}

    # jane.smith@example.com comes second (2)
    assert second.email == "jane.smith@example.com"
    assert set(second.names) == {"Jane Smith", "Jane S."}
def test_get_top_n_senders_with_empty_data():
    """An empty MailDir reports no top senders."""
    empty_maildir = MailDir([])

    result = empty_maildir.get_top_n_senders(5)

    assert len(result) == 0
def test_get_top_n_senders_with_n_greater_than_unique_senders(sample_email_metadata):
    """Asking for more senders than exist returns every unique sender once."""
    expected_emails = {
        "john.doe@example.com",
        "jane.smith@example.com",
        "alex.johnson@example.com",
        "sarah@example.com",
    }

    result = MailDir(sample_email_metadata).get_top_n_senders(10)

    # The sample data has only 4 unique addresses, so 10 is capped at 4.
    assert len(result) == 4
    assert {sender.email for sender in result} == expected_emails
def test_get_top_n_senders_ordering(sample_email_metadata):
    """Senders are returned in descending order of email count."""
    expected_order = [
        "john.doe@example.com",  # 4 emails
        "jane.smith@example.com",  # 2 emails
        "alex.johnson@example.com",  # 2 emails
        "sarah@example.com",  # 1 email
    ]

    ranked = MailDir(sample_email_metadata).get_top_n_senders(4)
    actual_order = [sender.email for sender in ranked]

    assert actual_order == expected_order
def test_parse_maildir(sample_email_dir):
    """Parsing the sample directory loads all three emails with derived columns."""
    frame = parse_maildir(sample_email_dir)._df
    assert len(frame) == 3

    emails = list(frame["Email"])
    for address in ("test@something.org", "not_a_test@something.org"):
        assert address in emails

    names = list(frame["Name"])
    for display_name in ("Test", "Not a Test", "Test2"):
        assert display_name in names