From 88899dfbd428cfc07d88352912588e6f6a0160b1 Mon Sep 17 00:00:00 2001 From: Alex Selimov <alex@alexselimov.com> Date: Sat, 19 Apr 2025 10:25:13 -0400 Subject: [PATCH] Add cli, top sender calculation functionality --- pyproject.toml | 12 ++----- src/maildirclean/__init__.py | 7 ++-- src/maildirclean/cli.py | 70 ++++++++++++++++++++++++++++++++++++ src/maildirclean/maildir.py | 34 ++++++++++++------ src/maildirclean/utility.py | 28 +++++++++++++++ tests/fixtures.py | 18 +++++----- tests/test_maildir.py | 26 +++++++------- 7 files changed, 152 insertions(+), 43 deletions(-) create mode 100644 src/maildirclean/cli.py create mode 100644 src/maildirclean/utility.py diff --git a/pyproject.toml b/pyproject.toml index a1cdeca..e014b2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,13 +3,9 @@ name = "maildirclean" version = "0.1.0" description = "Add your description here" readme = "README.md" -authors = [ - { name = "Alex Selimov", email = "alex@alexselimov.com" } -] +authors = [{ name = "Alex Selimov", email = "alex@alexselimov.com" }] requires-python = ">=3.13" -dependencies = [ - "pandas>=2.2.3", -] +dependencies = ["pandas>=2.2.3"] [project.scripts] maildirclean = "maildirclean:main" @@ -19,6 +15,4 @@ requires = ["hatchling"] build-backend = "hatchling.build" [dependency-groups] -dev = [ - "pytest>=8.3.5", -] +dev = ["pytest>=8.3.5"] diff --git a/src/maildirclean/__init__.py b/src/maildirclean/__init__.py index c9d146d..4463371 100644 --- a/src/maildirclean/__init__.py +++ b/src/maildirclean/__init__.py @@ -1,2 +1,5 @@ -def main() -> None: - print("Hello from maildirclean!") +from .cli import cli + + +def main() -> int: + return cli() diff --git a/src/maildirclean/cli.py b/src/maildirclean/cli.py new file mode 100644 index 0000000..d6b46ef --- /dev/null +++ b/src/maildirclean/cli.py @@ -0,0 +1,70 @@ +import argparse +import sys +from pathlib import Path + +from .maildir import parse_maildir + + +def parse_arguments() -> argparse.Namespace: + """Parse command line arguments + + 
Returns: Namespace object corresponding to parsed arguments + + """ + parser = argparse.ArgumentParser( + description="Analyze email metadata from a maildir directory" + ) + parser.add_argument( + "maildir", type=str, help="Path to the maildir directory to analyze" + ) + parser.add_argument( + "--top", + "-t", + type=int, + default=5, + help="Number of top senders to display (default: 5)", + ) + parser.add_argument( + "--verbose", "-v", action="store_true", help="Enable verbose output" + ) + + args = parser.parse_args() + return args + + +def cli(): + args = parse_arguments() + + maildir_path = Path(args.maildir) + if not maildir_path.exists() or not maildir_path.is_dir(): + print(f"Error: {args.maildir} is not a valid directory", file=sys.stderr) + return 1 + + if args.verbose: + print(f"Analyzing emails in {maildir_path}...") + + maildir = parse_maildir(maildir_path) + if args.verbose: + print(f"Found {len(maildir._df)} emails") + + top_senders = maildir.get_top_n_senders(args.top) + + if not top_senders: + print("No senders found in the maildir", file=sys.stderr) + return 0 + + result = [] + for i, sender in enumerate(top_senders, 1): + names_str = ", ".join(sender.names[:5]) # Limit to first 5 names + if len(sender.names) > 5: + names_str += f" and {len(sender.names) - 5} more" + + result.append(f"{i}. 
{sender.email} - Names used: {names_str}") + + output = "\n".join( + [f"Top {len(top_senders)} senders in {maildir_path}:", "=" * 40, *result] + ) + + print(output) + + return 0 diff --git a/src/maildirclean/maildir.py b/src/maildirclean/maildir.py index b13500b..d9932e7 100644 --- a/src/maildirclean/maildir.py +++ b/src/maildirclean/maildir.py @@ -1,8 +1,11 @@ from pathlib import Path import pandas as pd +import re + +from .utility import first_match_or_empty, to_datetime_safe -METADATA_SCHEMA = sorted(["Path", "From", "Date"]) +METADATA_SCHEMA = sorted(["path", "from", "date"]) def make_email_metadata(email_path: str | Path) -> dict[str, str]: @@ -15,13 +18,17 @@ def make_email_metadata(email_path: str | Path) -> dict[str, str]: Returns: Dict containing the required metadata """ key_is_set = {key: False for key in METADATA_SCHEMA} - metadata = {"Path": str(email_path)} - key_is_set["Path"] = True + metadata = {"path": str(email_path)} + key_is_set["path"] = True - with open(email_path, "r") as f: + with open(email_path, "rb") as f: for line in f: try: - k, v = [val.strip() for val in line.split(":", maxsplit=1)] + k, v = [ + val.strip() + for val in line.decode(errors="ignore").split(":", maxsplit=1) + ] + k = k.lower() if k in METADATA_SCHEMA: metadata[k] = v key_is_set[k] = True @@ -63,10 +70,17 @@ class MailDir: Stores the metadata associated with all local emails. """ + name_regex = r"^(.*?)(?=<)" + email_regex = r"?" 
+ KEYS_AND_FUNCS = { - "Name": lambda df: df["From"].map(lambda x: x.split("<")[0].strip('" ')), - "Email": lambda df: df["From"].map(lambda x: x.split("<")[1].strip(">")), - "Date": lambda df: pd.to_datetime(df["Date"]), + "name": lambda df: df["from"].map( + lambda x: first_match_or_empty(MailDir.name_regex, x).strip('" ') + ), + "email": lambda df: df["from"].map( + lambda x: first_match_or_empty(MailDir.email_regex, x).strip("") + ), + "date": lambda df: df["date"].map(lambda x: to_datetime_safe(x)), } def __init__(self, email_metadata: list[dict[str, str]]): @@ -93,9 +107,9 @@ class MailDir: senders = [ TopSender( - email, list(self._df.loc[self._df["Email"] == email, "Name"].unique()) + email, list(self._df.loc[self._df["email"] == email, "name"].unique()) ) - for email in self._df["Email"].value_counts().iloc[0:n].index + for email in self._df["email"].value_counts().iloc[0:n].index ] return senders diff --git a/src/maildirclean/utility.py b/src/maildirclean/utility.py new file mode 100644 index 0000000..483c110 --- /dev/null +++ b/src/maildirclean/utility.py @@ -0,0 +1,28 @@ +import re +import pandas as pd +from datetime import datetime + + +def first_match_or_empty(pattern: str, text: str) -> str: + """Get the first match for the provided pattern or "" if empty. 
+ This is a wrapper to facilitate usage of the re.search in lambda expressions + + Args: + pattern: Pattern to search for + text: Text that will be searched + + Returns: First pattern match or "" + + """ + match = re.search(pattern, text) + if match: + return match.group(1) + else: + return "" + + +def to_datetime_safe(datetime_str: str): + try: + return pd.to_datetime(datetime_str, format="mixed", errors="coerce") + except (ValueError, AttributeError): + return datetime.now() diff --git a/tests/fixtures.py b/tests/fixtures.py index 1ac2b79..a471597 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -125,13 +125,13 @@ def sample_email_dir(tmp_dir): @pytest.fixture def sample_email_metadata(): return [ - {"From": "John Doe ", "Date": "2025-01-01"}, - {"From": "John Doe ", "Date": "2025-01-02"}, - {"From": "Johnny Doe ", "Date": "2025-01-03"}, - {"From": "J. Doe ", "Date": "2025-01-04"}, - {"From": "Jane Smith ", "Date": "2025-01-05"}, - {"From": "Jane S. ", "Date": "2025-01-06"}, - {"From": "Alex Johnson ", "Date": "2025-01-07"}, - {"From": "Alex J. ", "Date": "2025-01-08"}, - {"From": "Sarah Williams ", "Date": "2025-01-09"}, + {"from": "John Doe ", "date": "2025-01-01"}, + {"from": "John Doe ", "date": "2025-01-02"}, + {"from": "Johnny Doe ", "date": "2025-01-03"}, + {"from": "J. Doe ", "date": "2025-01-04"}, + {"from": "Jane Smith ", "date": "2025-01-05"}, + {"from": "Jane S. ", "date": "2025-01-06"}, + {"from": "Alex Johnson ", "date": "2025-01-07"}, + {"from": "Alex J. 
", "date": "2025-01-08"}, + {"from": "Sarah Williams ", "date": "2025-01-09"}, ] diff --git a/tests/test_maildir.py b/tests/test_maildir.py index 4351420..710ca18 100644 --- a/tests/test_maildir.py +++ b/tests/test_maildir.py @@ -8,20 +8,20 @@ def test_email_parsing(test_email): metadata = make_email_metadata(test_email) - assert metadata["From"] == '"John Doe" ' - assert metadata["Date"] == "Wed, 16 Apr 2025 12:23:35 -0400" - assert metadata["Path"] == str(test_email) + assert metadata["from"] == '"John Doe" ' + assert metadata["date"] == "Wed, 16 Apr 2025 12:23:35 -0400" + assert metadata["path"] == str(test_email) def test_maildir_creation(test_email): maildir = MailDir([make_email_metadata(test_email)]) metadata = maildir._df.iloc[0] - assert metadata["From"] == '"John Doe" ' - assert metadata["Name"] == "John Doe" - assert metadata["Email"] == "sender@example.com" - assert metadata["Date"] == pd.to_datetime("Wed, 16 Apr 2025 12:23:35 -0400") - assert metadata["Path"] == str(test_email) + assert metadata["from"] == '"John Doe" ' + assert metadata["name"] == "John Doe" + assert metadata["email"] == "sender@example.com" + assert metadata["date"] == pd.to_datetime("Wed, 16 Apr 2025 12:23:35 -0400") + assert metadata["path"] == str(test_email) def test_get_top_n_senders(sample_email_metadata): @@ -94,9 +94,9 @@ def test_parse_maildir(sample_email_dir): maildir = parse_maildir(sample_email_dir) assert len(maildir._df) == 3 - assert "test@something.org" in list(maildir._df["Email"]) - assert "not_a_test@something.org" in list(maildir._df["Email"]) + assert "test@something.org" in list(maildir._df["email"]) + assert "not_a_test@something.org" in list(maildir._df["email"]) - assert "Test" in list(maildir._df["Name"]) - assert "Not a Test" in list(maildir._df["Name"]) - assert "Test2" in list(maildir._df["Name"]) + assert "Test" in list(maildir._df["name"]) + assert "Not a Test" in list(maildir._df["name"]) + assert "Test2" in list(maildir._df["name"])