Add maildir parser and tests
This commit is contained in:
parent
2c813c3242
commit
df23e520fa
@ -1,10 +1,11 @@
|
|||||||
import pathlib
|
from pathlib import Path
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
METADATA_SCHEMA = sorted(["Path", "From", "Date"])
|
METADATA_SCHEMA = sorted(["Path", "From", "Date"])
|
||||||
|
|
||||||
|
|
||||||
def make_email_metadata(email_path: str) -> dict[str, str]:
|
def make_email_metadata(email_path: str | Path) -> dict[str, str]:
|
||||||
"""Make an email metadata object by parsing the email contents
|
"""Make an email metadata object by parsing the email contents
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -14,7 +15,7 @@ def make_email_metadata(email_path: str) -> dict[str, str]:
|
|||||||
Returns: Dict containing the required metadata
|
Returns: Dict containing the required metadata
|
||||||
"""
|
"""
|
||||||
key_is_set = {key: False for key in METADATA_SCHEMA}
|
key_is_set = {key: False for key in METADATA_SCHEMA}
|
||||||
metadata = {"Path": email_path}
|
metadata = {"Path": str(email_path)}
|
||||||
key_is_set["Path"] = True
|
key_is_set["Path"] = True
|
||||||
|
|
||||||
with open(email_path, "r") as f:
|
with open(email_path, "r") as f:
|
||||||
@ -35,7 +36,7 @@ def make_email_metadata(email_path: str) -> dict[str, str]:
|
|||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
|
|
||||||
def parse_maildir(path_to_dir: str | Path) -> "MailDir":
    """Parse all of the emails within the specified maildir box (not recursively)

    Only regular files that sit directly inside ``path_to_dir`` are parsed.
    Sub-directories are skipped, because they cannot be opened as an email.
    Files are visited in sorted path order so the result does not depend on
    the order in which the file system lists them.

    Args:
        path_to_dir: Directory that holds one email per file

    Returns: MailDir object initialized with email information

    """
    # glob("*") yields directories as well as files; keep only what open() can read
    email_files = sorted(
        entry for entry in Path(path_to_dir).glob("*") if entry.is_file()
    )
    email_metadata = [make_email_metadata(email_file) for email_file in email_files]
    return MailDir(email_metadata)
|
||||||
|
|
||||||
|
|
||||||
|
class TopSender:
    """Simple class to store a top sender alongside the names they used

    The names are stored as given: this class neither truncates nor
    de-duplicates them, so any limit has to be applied by the caller.

    Args:
        email: Address of the sender
        names: Display names seen for that address
    """

    def __init__(self, email: str, names: list[str]):
        self.email = email
        # Copy so that later changes to the caller's list do not leak into this object
        self.names = list(names)

    def __repr__(self) -> str:
        # Readable form for assertion failures and debugging output
        return f"{type(self).__name__}(email={self.email!r}, names={self.names!r})"
|
||||||
|
|
||||||
|
|
||||||
class MailDir:
|
class MailDir:
|
||||||
@ -52,5 +63,39 @@ class MailDir:
|
|||||||
Stores the metadata associated with all local emails.
|
Stores the metadata associated with all local emails.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Derived columns: each entry maps a column name to a function that takes
# the metadata DataFrame and returns the Series stored under that name.
KEYS_AND_FUNCS = {
    # Display name: the text before the first "<", without quotes or spaces.
    # For a bare address (no "<") this is the address itself.
    "Name": lambda df: df["From"].map(lambda x: x.split("<")[0].strip('" ')),
    # Address: the text between the last "<" and the following ">".
    # rpartition returns the whole string when there is no "<", so a bare
    # address such as "user@example.com" is kept instead of raising IndexError.
    "Email": lambda df: df["From"].map(
        lambda x: x.rpartition("<")[2].partition(">")[0].strip()
    ),
    # utc=True lets headers with different UTC offsets share one column;
    # the instants are unchanged, only expressed in UTC.
    "Date": lambda df: pd.to_datetime(df["Date"], utc=True),
}
|
||||||
|
|
||||||
def __init__(self, email_metadata: list[dict[str, str]]):
    """Build the backing DataFrame from per-email metadata

    With at least one entry, the raw metadata becomes the DataFrame and every
    function in KEYS_AND_FUNCS adds (or overwrites) its column. With no
    entries, an empty DataFrame carrying the same column names is created so
    later column lookups still work.

    Args:
        email_metadata: One dict of header values per email
    """
    if email_metadata:
        self._df = pd.DataFrame(email_metadata)

        for k, func in self.KEYS_AND_FUNCS.items():
            self._df[k] = func(self._df)

    else:
        # "Date" is both a raw and a derived key; dict.fromkeys drops the
        # repeat while keeping order, so the empty frame has unique columns
        columns = list(dict.fromkeys([*METADATA_SCHEMA, *self.KEYS_AND_FUNCS]))
        self._df = pd.DataFrame(columns=columns)
|
||||||
|
|
||||||
|
def get_top_n_senders(self, n: int) -> list[TopSender]:
    """Calculate the top n senders and return their information as TopSender objects

    Senders are ranked by how many rows of the metadata table carry their
    address. Each TopSender holds that address together with the distinct
    names found in those rows.

    Args:
        n: Number of senders to retrieve; zero or a negative number yields
            an empty list

    Returns: list of TopSender objects, most frequent sender first

    """
    if n <= 0:
        # A negative n would otherwise slice from the end and silently
        # return every sender except the least frequent ones
        return []

    emails = self._df["Email"]
    names = self._df["Name"]
    return [
        TopSender(email, list(names[emails == email].unique()))
        for email in emails.value_counts().iloc[:n].index
    ]
|
||||||
|
@ -7,7 +7,7 @@ from tempfile import TemporaryDirectory
|
|||||||
def tmp_dir():
|
def tmp_dir():
|
||||||
tmp_dir = TemporaryDirectory(delete=False)
|
tmp_dir = TemporaryDirectory(delete=False)
|
||||||
print("Created temporary directory ", tmp_dir.name)
|
print("Created temporary directory ", tmp_dir.name)
|
||||||
yield tmp_dir.name
|
yield Path(tmp_dir.name)
|
||||||
print("Cleaned temporary directory", tmp_dir.name)
|
print("Cleaned temporary directory", tmp_dir.name)
|
||||||
tmp_dir.cleanup()
|
tmp_dir.cleanup()
|
||||||
|
|
||||||
@ -46,7 +46,7 @@ def test_email(tmp_dir):
|
|||||||
X-Google-Smtp-Source: AGHT+IGQyWQO69p9mhCHt5N5NbKLfb9Ij9fgRFGjk+UJNpRo3S9VPDV6pXXucyU0xAL3AiT5jNtO16w=
|
X-Google-Smtp-Source: AGHT+IGQyWQO69p9mhCHt5N5NbKLfb9Ij9fgRFGjk+UJNpRo3S9VPDV6pXXucyU0xAL3AiT5jNtO16w=
|
||||||
X-Received: by 2002:a25:fb02:0:b0:664:f31a:2be0 with SMTP id u2-20020a25fb02000000b00664f31a2be0mr13538287ybg.36.1713373420812;
|
X-Received: by 2002:a25:fb02:0:b0:664:f31a:2be0 with SMTP id u2-20020a25fb02000000b00664f31a2be0mr13538287ybg.36.1713373420812;
|
||||||
Wed, 16 Apr 2025 09:23:40 -0700 (PDT)
|
Wed, 16 Apr 2025 09:23:40 -0700 (PDT)
|
||||||
From: "John Doe" sender@example.com
|
From: "John Doe" <sender@example.com>
|
||||||
To: recipient@example.org
|
To: recipient@example.org
|
||||||
Subject: Sample email for parsing exercises
|
Subject: Sample email for parsing exercises
|
||||||
Date: Wed, 16 Apr 2025 12:23:35 -0400
|
Date: Wed, 16 Apr 2025 12:23:35 -0400
|
||||||
@ -92,7 +92,46 @@ def test_email(tmp_dir):
|
|||||||
Phone: (555) 123-4567
|
Phone: (555) 123-4567
|
||||||
"""
|
"""
|
||||||
|
|
||||||
email_path = Path(tmp_dir) / "test_email.txt"
|
email_path = tmp_dir / "test_email.txt"
|
||||||
with open(email_path, "w") as f:
|
with open(email_path, "w") as f:
|
||||||
f.write(test_contents)
|
f.write(test_contents)
|
||||||
yield email_path
|
return email_path
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_email_dir(tmp_dir):
    """Write three minimal emails into tmp_dir, one file each, and return the directory

    The files are named "0", "1" and "2" after their position in the list.
    """
    # Headers only: parsing of a complete single email is validated separately.
    # NOTE(review): any leading whitespace inside these literals could not be
    # recovered from the reviewed text; confirm against the repository.
    contents = (
        "\nFrom: Test <test@something.org>\nDate: Wed, 10 Apr 2025 12:23:35 -0400\n",
        "\nFrom: Not a Test <not_a_test@something.org>\nDate: Wed, 16 Apr 2024 08:23:35 -0400\n",
        '\nFrom: "Test2" <test@something.org>\nDate: Wed, 11 Apr 2025 12:23:35 -0400\n',
    )

    for index, text in enumerate(contents):
        (tmp_dir / str(index)).write_text(text)

    return tmp_dir
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_email_metadata():
    """Return nine From/Date metadata dicts spread over four sender addresses

    john.doe has four entries, jane.smith and alex.johnson two each, and
    sarah one; several addresses appear under more than one display name.
    """
    senders_and_dates = [
        ("John Doe <john.doe@example.com>", "2025-01-01"),
        ("John Doe <john.doe@example.com>", "2025-01-02"),
        ("Johnny Doe <john.doe@example.com>", "2025-01-03"),
        ("J. Doe <john.doe@example.com>", "2025-01-04"),
        ("Jane Smith <jane.smith@example.com>", "2025-01-05"),
        ("Jane S. <jane.smith@example.com>", "2025-01-06"),
        ("Alex Johnson <alex.johnson@example.com>", "2025-01-07"),
        ("Alex J. <alex.johnson@example.com>", "2025-01-08"),
        ("Sarah Williams <sarah@example.com>", "2025-01-09"),
    ]
    return [{"From": sender, "Date": date} for sender, date in senders_and_dates]
|
||||||
|
@ -1,11 +1,102 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
from fixtures import *
|
from fixtures import *
|
||||||
from maildirclean.maildir import make_email_metadata
|
from maildirclean.maildir import make_email_metadata, MailDir, TopSender, parse_maildir
|
||||||
|
|
||||||
|
|
||||||
def test_email_parsing(test_email):
    """make_email_metadata returns the From, Date and Path entries of the sample email"""
    expected = {
        "From": '"John Doe" <sender@example.com>',
        "Date": "Wed, 16 Apr 2025 12:23:35 -0400",
        "Path": str(test_email),
    }

    metadata = make_email_metadata(test_email)

    for key, value in expected.items():
        assert metadata[key] == value
|
||||||
|
|
||||||
|
|
||||||
|
def test_maildir_creation(test_email):
    """A MailDir built from one parsed email exposes raw and derived columns in its first row"""
    row = MailDir([make_email_metadata(test_email)])._df.iloc[0]

    expected = {
        "From": '"John Doe" <sender@example.com>',
        "Name": "John Doe",
        "Email": "sender@example.com",
        "Date": pd.to_datetime("Wed, 16 Apr 2025 12:23:35 -0400"),
        "Path": str(test_email),
    }
    for column, value in expected.items():
        assert row[column] == value
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_top_n_senders(sample_email_metadata):
    """The top two senders are returned with the distinct names each one used"""
    # Initialize MailDir with sample data
    maildir = MailDir(sample_email_metadata)

    # Test getting top 2 senders
    top_senders = maildir.get_top_n_senders(2)

    # Assertions
    assert len(top_senders) == 2

    # john.doe@example.com should be the top sender (4 emails)
    assert top_senders[0].email == "john.doe@example.com"
    assert set(top_senders[0].names) == {"John Doe", "Johnny Doe", "J. Doe"}

    # jane.smith and alex.johnson are tied on 2 emails, so either may come
    # second; the order among equal counts is not something to rely on
    names_by_runner_up = {
        "jane.smith@example.com": {"Jane Smith", "Jane S."},
        "alex.johnson@example.com": {"Alex Johnson", "Alex J."},
    }
    runner_up = top_senders[1]
    assert runner_up.email in names_by_runner_up
    assert set(runner_up.names) == names_by_runner_up[runner_up.email]
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_top_n_senders_with_empty_data():
    """A MailDir built from no emails reports no top senders"""
    empty_maildir = MailDir([])

    # Asking for senders from an empty box must give back nothing
    assert len(empty_maildir.get_top_n_senders(5)) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_top_n_senders_with_n_greater_than_unique_senders(sample_email_metadata):
    """Requesting more senders than exist returns every unique sender exactly once"""
    all_addresses = {
        "john.doe@example.com",
        "jane.smith@example.com",
        "alex.johnson@example.com",
        "sarah@example.com",
    }

    # Ask for 10 although the sample data holds only 4 unique addresses
    top_senders = MailDir(sample_email_metadata).get_top_n_senders(10)

    assert len(top_senders) == 4
    assert {sender.email for sender in top_senders} == all_addresses
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_top_n_senders_ordering(sample_email_metadata):
    """Senders come back ordered by descending email count"""
    # Initialize MailDir with sample data
    maildir = MailDir(sample_email_metadata)

    # Test getting all senders
    emails = [sender.email for sender in maildir.get_top_n_senders(4)]

    # Verify ordering by count
    assert len(emails) == 4
    assert emails[0] == "john.doe@example.com"  # 4 emails
    # jane.smith and alex.johnson both have 2 emails; only their shared rank
    # is checked because the order among equal counts is not relied upon
    assert set(emails[1:3]) == {
        "jane.smith@example.com",
        "alex.johnson@example.com",
    }
    assert emails[3] == "sarah@example.com"  # 1 email
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_maildir(sample_email_dir):
    """parse_maildir loads every file of the sample directory and derives Email and Name"""
    frame = parse_maildir(sample_email_dir)._df

    assert len(frame) == 3

    emails = list(frame["Email"])
    for address in ("test@something.org", "not_a_test@something.org"):
        assert address in emails

    names = list(frame["Name"])
    for name in ("Test", "Not a Test", "Test2"):
        assert name in names
|
||||||
|
Loading…
x
Reference in New Issue
Block a user