From 95126011246e91d57e3450c27f332c7f07930c4c Mon Sep 17 00:00:00 2001 From: Leon Date: Wed, 16 Jul 2025 18:38:06 +0200 Subject: [PATCH] feat: use message-id header to only process message once --- backend/app/crud/entries.py | 6 +++ backend/app/models/entries.py | 2 + backend/app/schemas/entries.py | 4 ++ backend/app/services/email_processor.py | 20 +++++++++- backend/app/tests/test_core.py | 51 +++++++++++++++++++++++-- backend/app/tests/test_crud.py | 29 +++++++++++--- backend/app/tests/test_routers.py | 12 +++++- backend/app/tests/test_services.py | 10 ++++- 8 files changed, 120 insertions(+), 14 deletions(-) diff --git a/backend/app/crud/entries.py b/backend/app/crud/entries.py index 4456aac..3f8455e 100644 --- a/backend/app/crud/entries.py +++ b/backend/app/crud/entries.py @@ -23,6 +23,12 @@ def get_entries_by_newsletter( ) +def get_entry_by_message_id(db: Session, message_id: str): + """Retrieve an entry by its message_id.""" + logger.debug(f"Querying for entry with message_id={message_id}") + return db.query(Entry).filter(Entry.message_id == message_id).first() + + def create_entry(db: Session, entry: EntryCreate, newsletter_id: int): """Create a new entry for a newsletter.""" logger.info( diff --git a/backend/app/models/entries.py b/backend/app/models/entries.py index ebb1724..6c1569a 100644 --- a/backend/app/models/entries.py +++ b/backend/app/models/entries.py @@ -8,6 +8,7 @@ from app.core.database import Base class Entry(Base): """Represents an entry (e.g., an email) associated with a newsletter.""" + __tablename__ = "entries" id = Column(Integer, primary_key=True, index=True) @@ -17,5 +18,6 @@ class Entry(Base): received_at = Column( DateTime(timezone=True), default=datetime.datetime.now(datetime.UTC) ) + message_id = Column(String, unique=True, index=True, nullable=False) newsletter = relationship("Newsletter", back_populates="entries") diff --git a/backend/app/schemas/entries.py b/backend/app/schemas/entries.py index 8b16e15..3c46f93 100644 --- a/backend/app/schemas/entries.py +++ b/backend/app/schemas/entries.py @@ -5,17 +5,21 @@ from pydantic import BaseModel, ConfigDict class EntryBase(BaseModel): """Base schema for an entry.""" + subject: str body: str + message_id: str class EntryCreate(EntryBase): """Schema for creating a new entry.""" + pass class Entry(EntryBase): """Schema for retrieving an entry with its ID and newsletter ID.""" + id: int newsletter_id: int received_at: datetime.datetime diff --git a/backend/app/services/email_processor.py b/backend/app/services/email_processor.py index 85ee6af..657b37e 100644 --- a/backend/app/services/email_processor.py +++ b/backend/app/services/email_processor.py @@ -4,7 +4,7 @@ import imaplib from sqlalchemy.orm import Session from app.core.logging import get_logger -from app.crud.entries import create_entry +from app.crud.entries import create_entry, get_entry_by_message_id from app.crud.newsletters import create_newsletter, get_newsletters from app.crud.settings import get_settings from app.schemas.entries import EntryCreate @@ -56,6 +56,20 @@ def process_emails(db: Session): msg = email.message_from_bytes(data[0][1]) sender = email.utils.parseaddr(msg["From"])[1] + message_id = msg.get("Message-ID") + + if not message_id: + logger.warning( + f"Email from {sender} with subject '{msg['Subject']}' has no Message-ID, skipping." + ) + continue + + if get_entry_by_message_id(db, message_id): + logger.info( + f"Email with Message-ID {message_id} already processed, skipping." + ) + continue + logger.debug( f"Processing email from {sender} with subject '{msg['Subject']}'" ) @@ -100,7 +114,9 @@ def process_emails(db: Session): final_body = html or body - entry = EntryCreate(subject=subject, body=final_body) + entry = EntryCreate( + subject=subject, body=final_body, message_id=message_id + ) create_entry(db, entry, newsletter.id) logger.info( f"Created new entry for newsletter '{newsletter.name}' from sender {sender}" diff --git a/backend/app/tests/test_core.py b/backend/app/tests/test_core.py index e03ed71..83b4185 100644 --- a/backend/app/tests/test_core.py +++ b/backend/app/tests/test_core.py @@ -70,7 +70,7 @@ def test_process_emails(mock_imap, db_session: Session): mock_mail.search.return_value = ("OK", [b"1"]) # Mock email content - mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\n\nTest Body" + mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\nMessage-ID: \n\nTest Body" mock_mail.fetch.return_value = ("OK", [(None, mock_msg_bytes)]) process_emails(db_session) @@ -151,7 +151,7 @@ def test_process_emails_auto_add_sender(mock_imap, db_session: Session): mock_mail = MagicMock() mock_imap.return_value = mock_mail mock_mail.search.return_value = ("OK", [b"1"]) - mock_msg_bytes = b"From: New Sender \nSubject: New Email\n\nHello" + mock_msg_bytes = b"From: New Sender \nSubject: New Email\nMessage-ID: \n\nHello" mock_mail.fetch.return_value = ("OK", [(None, mock_msg_bytes)]) process_emails(db_session) @@ -192,10 +192,55 @@ def test_process_emails_no_move_or_read(mock_imap, db_session: Session): mock_mail = MagicMock() mock_imap.return_value = mock_mail mock_mail.search.return_value = ("OK", [b"1"]) - mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\n\nTest Body" + mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\nMessage-ID: \n\nTest Body" mock_mail.fetch.return_value = ("OK", [(None, mock_msg_bytes)]) process_emails(db_session) mock_mail.store.assert_not_called() mock_mail.copy.assert_not_called() + + +@patch("app.services.email_processor.imaplib.IMAP4_SSL") +def test_process_emails_avoids_duplicates(mock_imap, db_session: Session): + """Test that process_emails avoids processing duplicate emails.""" + settings_data = SettingsCreate( + imap_server="imap.test.com", + imap_username="test@test.com", + imap_password="password", + ) + create_or_update_settings(db_session, settings_data) + newsletter_data = NewsletterCreate( + name="Test Newsletter", sender_emails=["newsletter@example.com"] + ) + newsletter = create_newsletter(db_session, newsletter_data) + + # Create an entry that already exists + from app.crud.entries import create_entry + from app.schemas.entries import EntryCreate + + create_entry( + db_session, + EntryCreate( + subject="Existing Subject", + body="Existing Body", + message_id="", + ), + newsletter.id, + ) + + mock_mail = MagicMock() + mock_imap.return_value = mock_mail + mock_mail.search.return_value = ("OK", [b"1"]) + # This email has the same Message-ID as the one we just created + mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\nMessage-ID: \n\nTest Body" + mock_mail.fetch.return_value = ("OK", [(None, mock_msg_bytes)]) + + process_emails(db_session) + + # Verify that no new entry was created + from app.crud.entries import get_entries_by_newsletter + + entries = get_entries_by_newsletter(db_session, newsletter.id) + assert len(entries) == 1 + assert entries[0].subject == "Existing Subject" diff --git a/backend/app/tests/test_crud.py b/backend/app/tests/test_crud.py index 145bdde..90f00e9 100644 --- a/backend/app/tests/test_crud.py +++ b/backend/app/tests/test_crud.py @@ -148,7 +148,11 @@ def test_create_entry(db_session: Session): name="Test Newsletter 5", sender_emails=[unique_email] ) newsletter = create_newsletter(db_session, newsletter_data) - entry_data = EntryCreate(subject="Test Subject", body="Test Body") + entry_data = EntryCreate( + subject="Test Subject", + body="Test Body", + message_id=f"<{uuid.uuid4()}@test.com>", + ) entry = create_entry(db_session, entry_data, newsletter.id) assert entry.subject == "Test Subject" assert entry.newsletter_id == newsletter.id @@ -162,10 +166,18 @@ def test_get_entries_by_newsletter(db_session: Session): ) newsletter = create_newsletter(db_session, newsletter_data) create_entry( - db_session, EntryCreate(subject="Entry 1", body="Body 1"), newsletter.id + db_session, + EntryCreate( + subject="Entry 1", body="Body 1", message_id=f"<{uuid.uuid4()}@test.com>" + ), + newsletter.id, ) create_entry( - db_session, EntryCreate(subject="Entry 2", body="Body 2"), newsletter.id + db_session, + EntryCreate( + subject="Entry 2", body="Body 2", message_id=f"<{uuid.uuid4()}@test.com>" + ), + newsletter.id, ) entries = get_entries_by_newsletter(db_session, newsletter.id) assert len(entries) == 2 @@ -183,9 +195,14 @@ def test_update_newsletter(db_session: Session): from app.schemas.newsletters import NewsletterUpdate updated_email = f"updated_sender_{uuid.uuid4()}@test.com" - updated_newsletter_data = NewsletterUpdate(name="Updated Newsletter", sender_emails=[updated_email]) + updated_newsletter_data = NewsletterUpdate( + name="Updated Newsletter", sender_emails=[updated_email] + ) from app.crud.newsletters import update_newsletter - updated_newsletter = update_newsletter(db_session, newsletter.id, updated_newsletter_data) + + updated_newsletter = update_newsletter( + db_session, newsletter.id, updated_newsletter_data + ) assert updated_newsletter.name == "Updated Newsletter" assert len(updated_newsletter.senders) == 1 @@ -201,6 +218,7 @@ def test_delete_newsletter(db_session: Session): newsletter = create_newsletter(db_session, newsletter_data) from app.crud.newsletters import delete_newsletter + deleted_newsletter = delete_newsletter(db_session, newsletter.id) assert deleted_newsletter.id == newsletter.id @@ -208,4 +226,5 @@ def test_delete_newsletter(db_session: Session): # Verify it's actually deleted from app.crud.newsletters import get_newsletter + assert get_newsletter(db_session, newsletter.id) is None diff --git a/backend/app/tests/test_routers.py b/backend/app/tests/test_routers.py index ea9016a..40bcf81 100644 --- a/backend/app/tests/test_routers.py +++ b/backend/app/tests/test_routers.py @@ -164,9 +164,17 @@ def test_get_newsletter_feed(client: TestClient): newsletter_id = create_response.json()["id"] # Add some entries to the newsletter - entry_data_1 = {"subject": "Test Entry 1", "body": "

Content 1

"} + entry_data_1 = { + "subject": "Test Entry 1", + "body": "

Content 1

", + "message_id": f"", + } client.post(f"/newsletters/{newsletter_id}/entries", json=entry_data_1) - entry_data_2 = {"subject": "Test Entry 2", "body": "

Content 2

"} + entry_data_2 = { + "subject": "Test Entry 2", + "body": "

Content 2

", + "message_id": f"", + } client.post(f"/newsletters/{newsletter_id}/entries", json=entry_data_2) response = client.get(f"/feeds/{newsletter_id}") diff --git a/backend/app/tests/test_services.py b/backend/app/tests/test_services.py index a42528c..1e5b49d 100644 --- a/backend/app/tests/test_services.py +++ b/backend/app/tests/test_services.py @@ -1,3 +1,5 @@ +import uuid + from sqlalchemy.orm import Session from app.crud.entries import create_entry @@ -17,12 +19,16 @@ def test_generate_feed(db_session: Session): # Create entries for the newsletter entry1_data = EntryCreate( - subject="First Entry", body="

This is the first entry.

" + subject="First Entry", + body="

This is the first entry.

", + message_id=f"<{uuid.uuid4()}@test.com>", ) create_entry(db_session, entry1_data, newsletter.id) entry2_data = EntryCreate( - subject="Second Entry", body="

This is the second entry.

" + subject="Second Entry", + body="

This is the second entry.

", + message_id=f"<{uuid.uuid4()}@test.com>", ) create_entry(db_session, entry2_data, newsletter.id)