diff --git a/workflows/Turn websites into a Google Sheets database with MrScraper and Gmail-13798/13798-turn-websites-into-a-google-sheets-database-with-mrscraper-and-gmail.webp b/workflows/Turn websites into a Google Sheets database with MrScraper and Gmail-13798/13798-turn-websites-into-a-google-sheets-database-with-mrscraper-and-gmail.webp new file mode 100644 index 000000000..efff77c81 --- /dev/null +++ b/workflows/Turn websites into a Google Sheets database with MrScraper and Gmail-13798/13798-turn-websites-into-a-google-sheets-database-with-mrscraper-and-gmail.webp @@ -0,0 +1 @@ +{"id":"uKd0S9PlArXXwgan","meta":{"instanceId":"ff80ff7708e50014ab81fa837934b47761ca37bb76e027238bca430a67bf5090","templateCredsSetupCompleted":true},"name":"Turn Internet Into Database","tags":[],"nodes":[{"id":"f2cbcbb3-d744-4805-9f21-bb373e57294e","name":"Extract All Url ","type":"n8n-nodes-base.code","position":[512,464],"parameters":{"language":"python","pythonCode":"items = []\nurls = set()\n\n# Loop through ALL input items, not just one\nfor input_item in _input.all():\n payload = input_item.json\n \n # Extract URLs from response data\n response = payload.get(\"data\", {}).get(\"response\") or []\n for page in response:\n listings = page.get(\"data\", {}).get(\"data\") or []\n for listing in listings:\n url = listing.get(\"url\")\n if isinstance(url, str) and url.strip():\n urls.add(url)\n \n # Extract the search link\n search_link = payload.get(\"data\", {}).get(\"link\")\n if isinstance(search_link, str) and search_link.strip():\n urls.add(search_link)\n\n# Convert set to list of items\nfor url in urls:\n items.append({\n \"json\": {\n \"url\": url\n }\n })\n\nreturn items"},"typeVersion":2},{"id":"46c25881-e096-472d-856b-58b19889ca47","name":"When clicking ‘Execute workflow’","type":"n8n-nodes-base.manualTrigger","position":[-288,480],"parameters":{},"typeVersion":1},{"id":"ba27b8ff-de07-445b-99e1-f91b4d5185f9","name":"Flatten 
Object","type":"n8n-nodes-base.code","position":[1168,464],"parameters":{"jsCode":"// Recursive function to flatten nested objects\nfunction flattenObject(obj, prefix = '', result = {}) {\n for (const key in obj) {\n if (!Object.prototype.hasOwnProperty.call(obj, key)) continue;\n\n const newKey = prefix ? `${prefix}_${key}` : key;\n const value = obj[key];\n\n if (value === null || value === undefined) {\n result[newKey] = null;\n } else if (Array.isArray(value)) {\n // Join array values\n result[newKey] = value.length ? value.join(', ') : null;\n } else if (typeof value === 'object' && !(value instanceof Date)) {\n flattenObject(value, newKey, result);\n } else {\n result[newKey] = value;\n }\n }\n return result;\n}\n\n// 1. Get ALL input items (important)\nconst items = $input.all();\n\n// 2. Flatten EACH item\nconst output = items.map(item => {\n const flattened = flattenObject(item.json);\n return { json: flattened };\n});\n\nreturn output;\n"},"executeOnce":false,"typeVersion":2},{"id":"bf8dd01b-029a-40e0-bbce-05c72c007843","name":"Looping Listing Page url","type":"n8n-nodes-base.splitInBatches","position":[320,480],"parameters":{"options":{"reset":false}},"typeVersion":3},{"id":"68262d46-6f6f-4180-9d68-7c0e1ac2db7f","name":"Looping Detail Page url","type":"n8n-nodes-base.splitInBatches","position":[976,496],"parameters":{"options":{"reset":false}},"retryOnFail":false,"typeVersion":3},{"id":"b17fd342-7d85-4eab-9e2f-d6a1c7b13a1e","name":"Sticky Note","type":"n8n-nodes-base.stickyNote","position":[-1152,416],"parameters":{"width":720,"height":976,"content":"## Phase 0: Setup and Configuration\n\n### Goal\n\nPrepare your MrScraper agents + n8n credentials so the workflow can run reliably.\n\n### What you need before running\n\n1. 
**Create the scrapers in your MrScraper account first** (one per agent):\n\n * **Map Agent Scraper** → for crawling/discovering URLs\n * **Listing Agent Scraper** → for scraping listing/search pages\n * **General Agent Scraper** → for scraping detail pages\n\n Each scraper will have its own **`scraperId`**. You’ll use these IDs in n8n to **Rerun** each agent and fetch results.\n2. **Enable AI Scraper API access** in your MrScraper account (so n8n can run scrapers via API).\n3. **Create credentials for MrScraper, Gmail, and Google.**\n\n### What To Do\n\n1. Add your **MrScraper API token** into your n8n credentials / “Input re-run API” node.\n2. Fill in all three **scraper IDs**:\n\n * `MAP_SCRAPER_ID`\n * `LISTING_SCRAPER_ID`\n * `GENERAL_SCRAPER_ID`\n3. Set your **target domain URL** (example: `https://example.com`).\n4. Define **URL patterns**:\n\n * Listing URL pattern (example: `/category/` or `/search?`)\n * Detail URL pattern (example: `/product/` or `/property/`)\n5. Set safety limits:\n\n * Map Agent crawl limits: `maxDepth`, `maxPages` and URL pattern\n * `Looping Batch`\n \n6. Connect **Google Sheets** (spreadsheet + sheet tab, append/upsert strategy).\n7. Connect **Gmail** (recipient, subject format, send rules: always / only new rows / only errors).\n\n\n"},"typeVersion":1},{"id":"ba037af9-3691-4760-90f9-8110099d86f7","name":"Sticky Note8","type":"n8n-nodes-base.stickyNote","position":[-400,656],"parameters":{"color":2,"width":624,"height":288,"content":"### Goal\nFind listing/search pages automatically from a domain.\n\n### What To Do\n1. Run **Map Agent (Rerun)** using your domain as input (Please enter the maximum depth, maximum pages, and include pattern (includePattern) as needed.)\n2. Collect all discovered URLs from the Map results.\n3. 
Filter URLs using your **listing pattern** (example: only URLs containing `/category/`).\n\n### Output\nA clean list of listing/search page URLs.\n"},"typeVersion":1},{"id":"e3ecd221-28f3-4657-a78d-1c495a5d940c","name":"Run listing agent scraper","type":"n8n-nodes-mrscraper.mrscraper","position":[688,512],"parameters":{"url":"=// Input Your url (required)","timeout":720,"maxPages":2,"operation":"listingAgent","scraperId":"=// Input Your scraperId from Mrscraper (required)","requestOptions":{}},"credentials":{"mrscraperApi":{"id":"9dYvWOEWBGjzZdW8","name":"riandra"}},"typeVersion":1},{"id":"3711e6c8-3586-4fc7-9e19-c2569325e405","name":"Run general agent scraper","type":"n8n-nodes-mrscraper.mrscraper","position":[1312,496],"parameters":{"url":"=// Input Your url (required)","operation":"generalAgent","scraperId":"=// Input Your scraperId from Mrscraper (required)","requestOptions":{}},"credentials":{"mrscraperApi":{"id":"9dYvWOEWBGjzZdW8","name":"riandra"}},"typeVersion":1},{"id":"c20c557a-9d73-4671-a476-3e9e54905e85","name":"Run map agent scraper","type":"n8n-nodes-mrscraper.mrscraper","position":[-16,480],"parameters":{"url":"=// Input Your url (required)","limit":1000,"operation":"mapAgent","scraperId":"=// Input Your scraperId from mrscraper (required)","requestOptions":{},"excludePatterns":"=// Input Your Exclude Pattern (Optional)","includePatterns":"=// Input Your Include Pattern (optional)"},"credentials":{"mrscraperApi":{"id":"9dYvWOEWBGjzZdW8","name":"riandra"}},"typeVersion":1},{"id":"5ea9af70-7382-4670-bc6c-9ba0c73c2de2","name":"Send a message","type":"n8n-nodes-base.gmail","position":[1936,480],"webhookId":"0c4c9c7f-124b-47fa-b052-f9bb25602601","parameters":{"sendTo":"// Input where you want to send","message":"// Input The message","options":{},"subject":"// Input The subject of the message","emailType":"text"},"credentials":{"gmailOAuth2":{"id":"ZCn7giWqNBeFwnWX","name":"Gmail account"}},"typeVersion":2.2},{"id":"9c70a124-d1d0-40ad-885e-df580a26b952","name":"Sticky 
Note9","type":"n8n-nodes-base.stickyNote","position":[-400,416],"parameters":{"color":2,"width":624,"height":224,"content":"## Phase 1: Discover URL (Crawling)"},"typeVersion":1},{"id":"c5a12156-b27a-4931-9b67-784ff72636b8","name":"Sticky Note10","type":"n8n-nodes-base.stickyNote","position":[240,416],"parameters":{"color":5,"width":624,"height":304,"content":"## Phase 2: Scrape Listing Page\n"},"typeVersion":1},{"id":"8ddced76-2ae6-45a8-9c86-5a4c9b4b3c80","name":"Sticky Note11","type":"n8n-nodes-base.stickyNote","position":[240,736],"parameters":{"color":5,"width":624,"height":320,"content":"### Goal\nFrom each listing/search page, extract **detail page URLs**.\n\n### What To Do\n1. Loop through each listing page URL from Phase 1.\n2. For each URL, call the **Listing Agent (Rerun)** using the scraperId from the mrscraper platform that was previously created.\n3. Extract all detail URLs from the Listing results.\n4. Normalize URLs (convert to absolute), then deduplicate them.\n\n### Output\nA deduped list of detail page URLs."},"typeVersion":1},{"id":"6be9e8e7-f239-4976-bdd2-2d7ef235024e","name":"Sticky Note12","type":"n8n-nodes-base.stickyNote","position":[880,416],"parameters":{"color":6,"width":624,"height":304,"content":"## Phase 3: Scrape Detail Data\n"},"typeVersion":1},{"id":"109b0272-6dae-4771-a912-a1ce9e2a3fb1","name":"Sticky Note13","type":"n8n-nodes-base.stickyNote","position":[880,736],"parameters":{"color":6,"width":624,"height":368,"content":"### Goal\nExtract structured fields from each detail page.\n\n### What To Do\n1. Loop through each detail URL from Phase 2.\n2. For each URL, call the **General Agent (Rerun)** using the scraperId from the mrscraper platform that was previously created.\n3. Select the fields you want to keep (examples: title, price, location, attributes, description).\n4. 
Normalize the output:\n * flatten nested JSON\n * format arrays into readable text\n * add metadata (source URL, scrape timestamp)\n\n### Output\nOne structured record per detail page (ready to export)."},"typeVersion":1},{"id":"da5aae6f-ec98-4072-ac6a-04533ec5704c","name":"Sticky Note14","type":"n8n-nodes-base.stickyNote","position":[1520,656],"parameters":{"color":3,"width":592,"height":416,"content":"### Goal\nSave results into Google Sheets and send a Gmail summary/alert.\n\n### What To Do\n1. Write rows into **Google Sheets**:\n * Append new rows, or upsert using a unique key (recommended: `source_url`).\n2. Build a run summary:\n * total listing pages processed\n * total detail pages scraped\n * total rows inserted/updated\n * errors (if any)\n3. Send a Gmail notification:\n * success digest (optional)\n * or alert only for new items / errors / threshold triggers\n\n### Output\nUpdated spreadsheet + email notification."},"typeVersion":1},{"id":"434625d8-040d-46c5-a367-4e43731d02aa","name":"Sticky Note15","type":"n8n-nodes-base.stickyNote","position":[1520,416],"parameters":{"color":3,"width":592,"height":224,"content":"## Phase 4: Export to Spreadsheet + Notify via Gmail"},"typeVersion":1},{"id":"8c8ec7f3-5318-4984-b03f-d866053a2dfa","name":"Append or update row in sheet","type":"n8n-nodes-base.googleSheets","position":[1648,480],"parameters":{"columns":{"value":{},"schema":[],"mappingMode":"autoMapInputData","matchingColumns":[],"attemptToConvertTypes":false,"convertFieldsToString":false},"options":{},"operation":"appendOrUpdate","sheetName":{"__rl":true,"mode":"url","value":"// Input Your sheets Url (required)"},"documentId":{"__rl":true,"mode":"url","value":"// Input Your Document Url (required)"}},"credentials":{"googleSheetsOAuth2Api":{"id":"X520LqyN5WppJ7Ht","name":"Google Sheets 
Maul"}},"typeVersion":4.7}],"active":false,"pinData":{},"settings":{"availableInMCP":false,"executionOrder":"v1"},"versionId":"c4e35a56-d1e3-466a-a78a-9c84c24f25df","connections":{"Flatten Object":{"main":[[{"node":"Append or update row in sheet","type":"main","index":0}]]},"Extract All Url ":{"main":[[{"node":"Looping Detail Page url","type":"main","index":0}]]},"Run map agent scraper":{"main":[[{"node":"Looping Listing Page url","type":"main","index":0}]]},"Looping Detail Page url":{"main":[[{"node":"Flatten Object","type":"main","index":0}],[{"node":"Run general agent scraper","type":"main","index":0}]]},"Looping Listing Page url":{"main":[[{"node":"Extract All Url ","type":"main","index":0}],[{"node":"Run listing agent scraper","type":"main","index":0}]]},"Run general agent scraper":{"main":[[{"node":"Looping Detail Page url","type":"main","index":0}]]},"Run listing agent scraper":{"main":[[{"node":"Looping Listing Page url","type":"main","index":0}]]},"Append or update row in sheet":{"main":[[{"node":"Send a message","type":"main","index":0}]]},"When clicking ‘Execute workflow’":{"main":[[{"node":"Run map agent scraper","type":"main","index":0}]]}}} \ No newline at end of file