feat: text content extraction

This commit is contained in:
Leon
2025-07-16 21:21:06 +02:00
parent 265e818780
commit 65902ed161
11 changed files with 568 additions and 446 deletions

View File

@@ -50,7 +50,9 @@ def get_newsletters(db: Session, skip: int = 0, limit: int = 100):
def create_newsletter(db: Session, newsletter: NewsletterCreate):
"""Create a new newsletter."""
logger.info(f"Creating new newsletter with name '{newsletter.name}'")
-db_newsletter = Newsletter(name=newsletter.name)
+db_newsletter = Newsletter(
+name=newsletter.name, extract_content=newsletter.extract_content
+)
db.add(db_newsletter)
db.commit()
db.refresh(db_newsletter)

View File

@@ -12,6 +12,7 @@ class Newsletter(Base):
id = Column(Integer, primary_key=True, index=True)
name = Column(String)
is_active = Column(Boolean, default=True)
+extract_content = Column(Boolean, default=False)
senders = relationship(
"Sender", back_populates="newsletter", cascade="all, delete-orphan"

View File

@@ -28,6 +28,7 @@ class NewsletterBase(BaseModel):
"""Base schema for a newsletter."""
name: str
+extract_content: bool = False
class NewsletterCreate(NewsletterBase):

View File

@@ -2,6 +2,7 @@ import email
import imaplib
from email.header import decode_header, make_header
+import trafilatura
from sqlalchemy.orm import Session
from app.core.logging import get_logger
@@ -114,6 +115,10 @@ def process_emails(db: Session):
pass
final_body = html or body
+if newsletter.extract_content:
+extracted_body = trafilatura.extract(final_body)
+if extracted_body:
+final_body = extracted_body
entry = EntryCreate(
subject=subject, body=final_body, message_id=message_id