feat: use Message-ID header to process each message only once

This commit is contained in:
Leon
2025-07-16 18:38:06 +02:00
parent d47e975574
commit 9512601124
8 changed files with 120 additions and 14 deletions

View File

@@ -23,6 +23,12 @@ def get_entries_by_newsletter(
)
def get_entry_by_message_id(db: Session, message_id: str):
    """Look up the entry whose ``message_id`` column equals the given value.

    Returns the first matching ``Entry`` row, or ``None`` when the query
    yields no rows.
    """
    logger.debug(f"Querying for entry with message_id={message_id}")
    matching_entries = db.query(Entry).filter(Entry.message_id == message_id)
    return matching_entries.first()
def create_entry(db: Session, entry: EntryCreate, newsletter_id: int):
"""Create a new entry for a newsletter."""
logger.info(

View File

@@ -8,6 +8,7 @@ from app.core.database import Base
class Entry(Base):
"""Represents an entry (e.g., an email) associated with a newsletter."""
__tablename__ = "entries"
id = Column(Integer, primary_key=True, index=True)
@@ -17,5 +18,6 @@ class Entry(Base):
# Pass a callable so the timestamp is computed for each INSERT. Calling
# now() inline would evaluate it once, when the class body runs at import
# time, and every row would then share that single frozen value.
received_at = Column(
    DateTime(timezone=True),
    default=lambda: datetime.datetime.now(datetime.UTC),
)
message_id = Column(String, unique=True, index=True, nullable=False)
newsletter = relationship("Newsletter", back_populates="entries")

View File

@@ -5,17 +5,21 @@ from pydantic import BaseModel, ConfigDict
class EntryBase(BaseModel):
    """Fields shared by the entry schemas that derive from this class.

    All three fields are required strings.
    """

    # Subject line of the entry.
    subject: str
    # Entry content.
    body: str
    # Message identifier; the email processor fills this from the
    # email's Message-ID header.
    message_id: str
class EntryCreate(EntryBase):
    """Payload schema for creating an entry; adds no fields to EntryBase."""
class Entry(EntryBase):
"""Schema for retrieving an entry with its ID and newsletter ID."""
id: int
newsletter_id: int
received_at: datetime.datetime

View File

@@ -4,7 +4,7 @@ import imaplib
from sqlalchemy.orm import Session
from app.core.logging import get_logger
from app.crud.entries import create_entry
from app.crud.entries import create_entry, get_entry_by_message_id
from app.crud.newsletters import create_newsletter, get_newsletters
from app.crud.settings import get_settings
from app.schemas.entries import EntryCreate
@@ -56,6 +56,20 @@ def process_emails(db: Session):
msg = email.message_from_bytes(data[0][1])
sender = email.utils.parseaddr(msg["From"])[1]
message_id = msg.get("Message-ID")
if not message_id:
logger.warning(
f"Email from {sender} with subject '{msg['Subject']}' has no Message-ID, skipping."
)
continue
if get_entry_by_message_id(db, message_id):
logger.info(
f"Email with Message-ID {message_id} already processed, skipping."
)
continue
logger.debug(
f"Processing email from {sender} with subject '{msg['Subject']}'"
)
@@ -100,7 +114,9 @@ def process_emails(db: Session):
final_body = html or body
entry = EntryCreate(subject=subject, body=final_body)
entry = EntryCreate(
subject=subject, body=final_body, message_id=message_id
)
create_entry(db, entry, newsletter.id)
logger.info(
f"Created new entry for newsletter '{newsletter.name}' from sender {sender}"

View File

@@ -70,7 +70,7 @@ def test_process_emails(mock_imap, db_session: Session):
mock_mail.search.return_value = ("OK", [b"1"])
# Mock email content
mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\n\nTest Body"
mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\nMessage-ID: <test@test.com>\n\nTest Body"
mock_mail.fetch.return_value = ("OK", [(None, mock_msg_bytes)])
process_emails(db_session)
@@ -151,7 +151,7 @@ def test_process_emails_auto_add_sender(mock_imap, db_session: Session):
mock_mail = MagicMock()
mock_imap.return_value = mock_mail
mock_mail.search.return_value = ("OK", [b"1"])
mock_msg_bytes = b"From: New Sender <new@example.com>\nSubject: New Email\n\nHello"
mock_msg_bytes = b"From: New Sender <new@example.com>\nSubject: New Email\nMessage-ID: <new@new.com>\n\nHello"
mock_mail.fetch.return_value = ("OK", [(None, mock_msg_bytes)])
process_emails(db_session)
@@ -192,10 +192,55 @@ def test_process_emails_no_move_or_read(mock_imap, db_session: Session):
mock_mail = MagicMock()
mock_imap.return_value = mock_mail
mock_mail.search.return_value = ("OK", [b"1"])
mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\n\nTest Body"
mock_msg_bytes = b"From: newsletter@example.com\nSubject: Test Subject\nMessage-ID: <test@test.com>\n\nTest Body"
mock_mail.fetch.return_value = ("OK", [(None, mock_msg_bytes)])
process_emails(db_session)
mock_mail.store.assert_not_called()
mock_mail.copy.assert_not_called()
@patch("app.services.email_processor.imaplib.IMAP4_SSL")
def test_process_emails_avoids_duplicates(mock_imap, db_session: Session):
    """An email whose Message-ID is already stored must not create a second entry."""
    from app.crud.entries import create_entry, get_entries_by_newsletter
    from app.schemas.entries import EntryCreate

    # Store IMAP settings and a newsletter for the sender used below.
    create_or_update_settings(
        db_session,
        SettingsCreate(
            imap_server="imap.test.com",
            imap_username="test@test.com",
            imap_password="password",
        ),
    )
    newsletter = create_newsletter(
        db_session,
        NewsletterCreate(
            name="Test Newsletter", sender_emails=["newsletter@example.com"]
        ),
    )

    # Seed the entry that the incoming email will collide with.
    existing_entry = EntryCreate(
        subject="Existing Subject",
        body="Existing Body",
        message_id="<existing@message.com>",
    )
    create_entry(db_session, existing_entry, newsletter.id)

    # Fake IMAP connection serving one message whose Message-ID matches
    # the seeded entry.
    raw_email = b"From: newsletter@example.com\nSubject: Test Subject\nMessage-ID: <existing@message.com>\n\nTest Body"
    imap_conn = MagicMock()
    imap_conn.search.return_value = ("OK", [b"1"])
    imap_conn.fetch.return_value = ("OK", [(None, raw_email)])
    mock_imap.return_value = imap_conn

    process_emails(db_session)

    # Only the seeded entry should remain for this newsletter.
    stored = get_entries_by_newsletter(db_session, newsletter.id)
    assert len(stored) == 1
    assert stored[0].subject == "Existing Subject"

View File

@@ -148,7 +148,11 @@ def test_create_entry(db_session: Session):
name="Test Newsletter 5", sender_emails=[unique_email]
)
newsletter = create_newsletter(db_session, newsletter_data)
entry_data = EntryCreate(subject="Test Subject", body="Test Body")
entry_data = EntryCreate(
subject="Test Subject",
body="Test Body",
message_id=f"<{uuid.uuid4()}@test.com>",
)
entry = create_entry(db_session, entry_data, newsletter.id)
assert entry.subject == "Test Subject"
assert entry.newsletter_id == newsletter.id
@@ -162,10 +166,18 @@ def test_get_entries_by_newsletter(db_session: Session):
)
newsletter = create_newsletter(db_session, newsletter_data)
create_entry(
db_session, EntryCreate(subject="Entry 1", body="Body 1"), newsletter.id
db_session,
EntryCreate(
subject="Entry 1", body="Body 1", message_id=f"<{uuid.uuid4()}@test.com>"
),
newsletter.id,
)
create_entry(
db_session, EntryCreate(subject="Entry 2", body="Body 2"), newsletter.id
db_session,
EntryCreate(
subject="Entry 2", body="Body 2", message_id=f"<{uuid.uuid4()}@test.com>"
),
newsletter.id,
)
entries = get_entries_by_newsletter(db_session, newsletter.id)
assert len(entries) == 2
@@ -183,9 +195,14 @@ def test_update_newsletter(db_session: Session):
from app.schemas.newsletters import NewsletterUpdate
updated_email = f"updated_sender_{uuid.uuid4()}@test.com"
updated_newsletter_data = NewsletterUpdate(name="Updated Newsletter", sender_emails=[updated_email])
updated_newsletter_data = NewsletterUpdate(
name="Updated Newsletter", sender_emails=[updated_email]
)
from app.crud.newsletters import update_newsletter
updated_newsletter = update_newsletter(db_session, newsletter.id, updated_newsletter_data)
updated_newsletter = update_newsletter(
db_session, newsletter.id, updated_newsletter_data
)
assert updated_newsletter.name == "Updated Newsletter"
assert len(updated_newsletter.senders) == 1
@@ -201,6 +218,7 @@ def test_delete_newsletter(db_session: Session):
newsletter = create_newsletter(db_session, newsletter_data)
from app.crud.newsletters import delete_newsletter
deleted_newsletter = delete_newsletter(db_session, newsletter.id)
assert deleted_newsletter.id == newsletter.id
@@ -208,4 +226,5 @@ def test_delete_newsletter(db_session: Session):
# Verify it's actually deleted
from app.crud.newsletters import get_newsletter
assert get_newsletter(db_session, newsletter.id) is None

View File

@@ -164,9 +164,17 @@ def test_get_newsletter_feed(client: TestClient):
newsletter_id = create_response.json()["id"]
# Add some entries to the newsletter
entry_data_1 = {"subject": "Test Entry 1", "body": "<p>Content 1</p>"}
entry_data_1 = {
"subject": "Test Entry 1",
"body": "<p>Content 1</p>",
"message_id": f"<entry1_{uuid.uuid4()}@test.com>",
}
client.post(f"/newsletters/{newsletter_id}/entries", json=entry_data_1)
entry_data_2 = {"subject": "Test Entry 2", "body": "<p>Content 2</p>"}
entry_data_2 = {
"subject": "Test Entry 2",
"body": "<p>Content 2</p>",
"message_id": f"<entry2_{uuid.uuid4()}@test.com>",
}
client.post(f"/newsletters/{newsletter_id}/entries", json=entry_data_2)
response = client.get(f"/feeds/{newsletter_id}")

View File

@@ -1,3 +1,5 @@
import uuid
from sqlalchemy.orm import Session
from app.crud.entries import create_entry
@@ -17,12 +19,16 @@ def test_generate_feed(db_session: Session):
# Create entries for the newsletter
entry1_data = EntryCreate(
subject="First Entry", body="<p>This is the first entry.</p>"
subject="First Entry",
body="<p>This is the first entry.</p>",
message_id=f"<{uuid.uuid4()}@test.com>",
)
create_entry(db_session, entry1_data, newsletter.id)
entry2_data = EntryCreate(
subject="Second Entry", body="<p>This is the second entry.</p>"
subject="Second Entry",
body="<p>This is the second entry.</p>",
message_id=f"<{uuid.uuid4()}@test.com>",
)
create_entry(db_session, entry2_data, newsletter.id)