Add maildir parser and tests
This commit is contained in:
parent
2c813c3242
commit
df23e520fa
@ -1,10 +1,11 @@
|
||||
import pathlib
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
|
||||
|
||||
METADATA_SCHEMA = sorted(["Path", "From", "Date"])
|
||||
|
||||
|
||||
def make_email_metadata(email_path: str) -> dict[str, str]:
|
||||
def make_email_metadata(email_path: str | Path) -> dict[str, str]:
|
||||
"""Make an email metadata object by parsing the email contents
|
||||
|
||||
Args:
|
||||
@ -14,7 +15,7 @@ def make_email_metadata(email_path: str) -> dict[str, str]:
|
||||
Returns: Dict containing the required metadata
|
||||
"""
|
||||
key_is_set = {key: False for key in METADATA_SCHEMA}
|
||||
metadata = {"Path": email_path}
|
||||
metadata = {"Path": str(email_path)}
|
||||
key_is_set["Path"] = True
|
||||
|
||||
with open(email_path, "r") as f:
|
||||
@ -35,7 +36,7 @@ def make_email_metadata(email_path: str) -> dict[str, str]:
|
||||
return metadata
|
||||
|
||||
|
||||
def parse_maildir(path_to_dir: str | Path) -> "MailDir":
    """Parse all of the emails within the specified maildir box (not recursively)

    Args:
        path_to_dir: Directory whose immediate children are the email files

    Returns: MailDir object initialized with email information

    """
    # glob("*") also yields sub-directories; make_email_metadata opens every
    # path it is handed, so only regular files are passed on.
    # Sorting keeps the row order independent of the filesystem listing order.
    email_files = sorted(
        entry for entry in Path(path_to_dir).glob("*") if entry.is_file()
    )
    email_metadata = [make_email_metadata(email_file) for email_file in email_files]
    return MailDir(email_metadata)
|
||||
|
||||
|
||||
class TopSender:
    """Store a sender's email address alongside the names used with it.

    The class keeps whatever list of names it is given; it does not cap or
    de-duplicate the names itself.
    """

    def __init__(self, email: str, names: list[str]):
        """Record the sender.

        Args:
            email: Email address of the sender
            names: Names associated with that address
        """
        self.email = email
        # Copy so later mutation of the caller's list does not leak in here.
        self.names = list(names)

    def __repr__(self) -> str:
        return f"{type(self).__name__}(email={self.email!r}, names={self.names!r})"
|
||||
|
||||
|
||||
class MailDir:
|
||||
@ -52,5 +63,39 @@ class MailDir:
|
||||
Stores the metadata associated with all local emails.
|
||||
"""
|
||||
|
||||
# Derived column name -> function computing that column from the DataFrame.
KEYS_AND_FUNCS = {
    # Text before the first "<", without surrounding quotes/spaces.
    "Name": lambda df: df["From"].map(lambda x: x.partition("<")[0].strip('" ')),
    # Text after the last "<" without the closing ">". When the header has no
    # angle brackets, rpartition leaves the whole value in the last slot, so a
    # bare address is kept instead of raising IndexError.
    "Email": lambda df: df["From"].map(
        lambda x: x.rpartition("<")[2].strip("> \t\r\n")
    ),
    # Replaces the raw "Date" strings with parsed datetimes.
    "Date": lambda df: pd.to_datetime(df["Date"]),
}
|
||||
|
||||
def __init__(self, email_metadata: list[dict[str, str]]):
    """Build the internal DataFrame from per-email metadata dicts.

    Args:
        email_metadata: One dict per email. The derived columns listed in
            KEYS_AND_FUNCS are computed from its "From" and "Date" entries.
            An empty list produces an empty frame with the expected columns.
    """
    if email_metadata:
        self._df = pd.DataFrame(email_metadata)

        # Add (or overwrite) each derived column in declaration order.
        for column, derive in self.KEYS_AND_FUNCS.items():
            self._df[column] = derive(self._df)
    else:
        # "Date" appears in both METADATA_SCHEMA and KEYS_AND_FUNCS; plain
        # concatenation would create two columns with the same label.
        # dict.fromkeys drops the repeat while keeping the order.
        columns = list(dict.fromkeys([*METADATA_SCHEMA, *self.KEYS_AND_FUNCS]))
        self._df = pd.DataFrame(columns=columns)
|
||||
|
||||
def get_top_n_senders(self, n: int) -> list[TopSender]:
    """Return the n most frequent senders as TopSender objects.

    Each TopSender carries the sender's email address and the distinct
    names seen with that address, in order of first appearance.

    Args:
        n: Number of senders to retrieve. Values of zero or less yield an
            empty list.

    Returns: list of TopSender objects, most frequent sender first; shorter
        than n when there are fewer distinct senders

    """
    # A negative n would otherwise slice "all but the last |n|" senders.
    if n <= 0 or self._df.empty:
        return []

    top_emails = self._df["Email"].value_counts().iloc[:n].index

    # Collect the distinct names per address in one pass instead of
    # re-filtering the whole frame once per sender.
    names_by_email = self._df.groupby("Email", sort=False)["Name"].unique()

    return [TopSender(email, list(names_by_email[email])) for email in top_emails]
|
||||
|
@ -7,7 +7,7 @@ from tempfile import TemporaryDirectory
|
||||
def tmp_dir():
|
||||
tmp_dir = TemporaryDirectory(delete=False)
|
||||
print("Created temporary directory ", tmp_dir.name)
|
||||
yield tmp_dir.name
|
||||
yield Path(tmp_dir.name)
|
||||
print("Cleaned temporary directory", tmp_dir.name)
|
||||
tmp_dir.cleanup()
|
||||
|
||||
@ -46,7 +46,7 @@ def test_email(tmp_dir):
|
||||
X-Google-Smtp-Source: AGHT+IGQyWQO69p9mhCHt5N5NbKLfb9Ij9fgRFGjk+UJNpRo3S9VPDV6pXXucyU0xAL3AiT5jNtO16w=
|
||||
X-Received: by 2002:a25:fb02:0:b0:664:f31a:2be0 with SMTP id u2-20020a25fb02000000b00664f31a2be0mr13538287ybg.36.1713373420812;
|
||||
Wed, 16 Apr 2025 09:23:40 -0700 (PDT)
|
||||
From: "John Doe" sender@example.com
|
||||
From: "John Doe" <sender@example.com>
|
||||
To: recipient@example.org
|
||||
Subject: Sample email for parsing exercises
|
||||
Date: Wed, 16 Apr 2025 12:23:35 -0400
|
||||
@ -92,7 +92,46 @@ def test_email(tmp_dir):
|
||||
Phone: (555) 123-4567
|
||||
"""
|
||||
|
||||
email_path = Path(tmp_dir) / "test_email.txt"
|
||||
email_path = tmp_dir / "test_email.txt"
|
||||
with open(email_path, "w") as f:
|
||||
f.write(test_contents)
|
||||
yield email_path
|
||||
return email_path
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_email_dir(tmp_dir):
|
||||
# Include only the necessary meta data since we have validated single email parsing
|
||||
sample_emails = [
|
||||
"""
|
||||
From: Test <test@something.org>
|
||||
Date: Wed, 10 Apr 2025 12:23:35 -0400
|
||||
""",
|
||||
"""
|
||||
From: Not a Test <not_a_test@something.org>
|
||||
Date: Wed, 16 Apr 2024 08:23:35 -0400
|
||||
""",
|
||||
"""
|
||||
From: "Test2" <test@something.org>
|
||||
Date: Wed, 11 Apr 2025 12:23:35 -0400
|
||||
""",
|
||||
]
|
||||
|
||||
for i, email in enumerate(sample_emails):
|
||||
with open(tmp_dir / f"{i}", "w") as f:
|
||||
f.write(email)
|
||||
return tmp_dir
|
||||
|
||||
|
||||
@pytest.fixture
def sample_email_metadata():
    """Metadata for nine emails from four addresses (4, 2, 2 and 1 emails)."""
    from_headers = [
        "John Doe <john.doe@example.com>",
        "John Doe <john.doe@example.com>",
        "Johnny Doe <john.doe@example.com>",
        "J. Doe <john.doe@example.com>",
        "Jane Smith <jane.smith@example.com>",
        "Jane S. <jane.smith@example.com>",
        "Alex Johnson <alex.johnson@example.com>",
        "Alex J. <alex.johnson@example.com>",
        "Sarah Williams <sarah@example.com>",
    ]
    # Dates run consecutively from 2025-01-01 to 2025-01-09.
    return [
        {"From": sender, "Date": f"2025-01-0{day}"}
        for day, sender in enumerate(from_headers, start=1)
    ]
|
||||
|
@ -1,11 +1,102 @@
|
||||
import pandas as pd
|
||||
|
||||
from fixtures import *
|
||||
from maildirclean.maildir import make_email_metadata
|
||||
from maildirclean.maildir import make_email_metadata, MailDir, TopSender, parse_maildir
|
||||
|
||||
|
||||
def test_email_parsing(test_email):
    """Parsing one email file yields its From, Date and (stringified) Path."""
    metadata = make_email_metadata(test_email)

    assert metadata["From"] == '"John Doe" <sender@example.com>'
    assert metadata["Date"] == "Wed, 16 Apr 2025 12:23:35 -0400"
    # The parser stores the path as a string even when given a Path object.
    assert metadata["Path"] == str(test_email)
|
||||
|
||||
|
||||
def test_maildir_creation(test_email):
    """A single parsed email becomes one row with raw and derived columns."""
    first_row = MailDir([make_email_metadata(test_email)])._df.iloc[0]

    expected = {
        "From": '"John Doe" <sender@example.com>',
        "Name": "John Doe",
        "Email": "sender@example.com",
        "Date": pd.to_datetime("Wed, 16 Apr 2025 12:23:35 -0400"),
        "Path": str(test_email),
    }
    for column, value in expected.items():
        assert first_row[column] == value
|
||||
|
||||
|
||||
def test_get_top_n_senders(sample_email_metadata):
    """The two most frequent senders come back with all their names."""
    top_senders = MailDir(sample_email_metadata).get_top_n_senders(2)

    assert len(top_senders) == 2
    first, second = top_senders

    # Four emails were sent from john.doe@example.com.
    assert first.email == "john.doe@example.com"
    assert set(first.names) == {"John Doe", "Johnny Doe", "J. Doe"}

    # Two emails were sent from jane.smith@example.com.
    assert second.email == "jane.smith@example.com"
    assert set(second.names) == {"Jane Smith", "Jane S."}
|
||||
|
||||
|
||||
def test_get_top_n_senders_with_empty_data():
    """A MailDir built from no emails reports no top senders."""
    empty_maildir = MailDir([])

    assert len(empty_maildir.get_top_n_senders(5)) == 0
|
||||
|
||||
|
||||
def test_get_top_n_senders_with_n_greater_than_unique_senders(sample_email_metadata):
    """Asking for more senders than exist returns every distinct sender."""
    expected_emails = {
        "john.doe@example.com",
        "jane.smith@example.com",
        "alex.johnson@example.com",
        "sarah@example.com",
    }

    top_senders = MailDir(sample_email_metadata).get_top_n_senders(10)

    # The sample data holds only four distinct addresses.
    assert len(top_senders) == 4
    assert {sender.email for sender in top_senders} == expected_emails
|
||||
|
||||
|
||||
def test_get_top_n_senders_ordering(sample_email_metadata):
    """Senders are listed from most to fewest emails."""
    expected_order = [
        "john.doe@example.com",  # 4 emails
        "jane.smith@example.com",  # 2 emails
        "alex.johnson@example.com",  # 2 emails
        "sarah@example.com",  # 1 email
    ]

    top_senders = MailDir(sample_email_metadata).get_top_n_senders(4)
    actual_order = [sender.email for sender in top_senders]

    assert actual_order == expected_order
|
||||
|
||||
|
||||
def test_parse_maildir(sample_email_dir):
    """Every file in the sample directory ends up as one parsed row."""
    frame = parse_maildir(sample_email_dir)._df

    assert len(frame) == 3

    emails = list(frame["Email"])
    for expected_email in ("test@something.org", "not_a_test@something.org"):
        assert expected_email in emails

    names = list(frame["Name"])
    for expected_name in ("Test", "Not a Test", "Test2"):
        assert expected_name in names
|
||||
|
Loading…
x
Reference in New Issue
Block a user