From 2db493d83df72f61ec4759d72f1c76a554fa6ee6 Mon Sep 17 00:00:00 2001 From: nusquama Date: Wed, 12 Nov 2025 12:50:31 +0100 Subject: [PATCH] creation --- .../simple_eval_for_legal_benchmarking.json | 1 + 1 file changed, 1 insertion(+) create mode 100644 workflows/Simple Eval for Legal Benchmarking-4712/simple_eval_for_legal_benchmarking.json diff --git a/workflows/Simple Eval for Legal Benchmarking-4712/simple_eval_for_legal_benchmarking.json b/workflows/Simple Eval for Legal Benchmarking-4712/simple_eval_for_legal_benchmarking.json new file mode 100644 index 000000000..3e05b3a8c --- /dev/null +++ b/workflows/Simple Eval for Legal Benchmarking-4712/simple_eval_for_legal_benchmarking.json @@ -0,0 +1 @@ +{"meta":{"instanceId":"45e293393b5dd8437fb351e5b1ef5511ef67e6e0826a1c10b9b68be850b67593"},"nodes":[{"id":"5c25edcc-987c-4b60-b472-6c476dc866c6","name":"When clicking ‘Test workflow’","type":"n8n-nodes-base.manualTrigger","position":[-580,440],"parameters":{},"typeVersion":1},{"id":"60521a1e-3f00-4dd6-9651-a5f9ae1659ab","name":"Loop Over Items","type":"n8n-nodes-base.splitInBatches","position":[-20,-180],"parameters":{"options":{"reset":false},"batchSize":"=1"},"typeVersion":3},{"id":"8e0fd8c4-51a1-44fb-8524-034df4fd4fe5","name":"Wait","type":"n8n-nodes-base.wait","position":[1520,340],"webhookId":"4e47e9cb-5e6d-4020-a2db-f60c04f2f727","parameters":{"amount":0.5},"typeVersion":1.1},{"id":"0e457acc-5047-43ed-977f-f08714498ffc","name":"OpenRouter Chat Model","type":"@n8n/n8n-nodes-langchain.lmChatOpenRouter","position":[740,320],"parameters":{"model":"openai/gpt-4.1","options":{}},"credentials":{"openRouterApi":{"id":"ipzDVYsZqbum9bX4","name":"OpenRouter account 2"}},"typeVersion":1},{"id":"46ce90ac-6b58-4770-a6bd-48fccefc8713","name":"Extract from File","type":"n8n-nodes-base.extractFromFile","position":[500,20],"parameters":{"options":{},"operation":"pdf"},"typeVersion":1},{"id":"9e76d679-6f4b-419b-af36-771aba5b9e98","name":"Google Drive","type":"n8n-nodes-base.googleDrive","onError":"continueRegularOutput","position":[120,20],"parameters":{"fileId":{"__rl":true,"mode":"id","value":"={{ $json[\"URL\"].match(/[-\\w]{25,}/)[0] }}"},"options":{},"operation":"download"},"credentials":{"googleDriveOAuth2Api":{"id":"yej6mV2w6RslwOGo","name":"Google Drive account"}},"typeVersion":3,"alwaysOutputData":true},{"id":"8f2fc75f-ea5d-4700-b58b-b5f5e3e360e5","name":"Get Tests","type":"n8n-nodes-base.googleSheets","position":[-420,440],"parameters":{"options":{},"sheetName":{"__rl":true,"mode":"list","value":"gid=0","cachedResultUrl":"https://docs.google.com/spreadsheets/d/10l_gMtPsge00eTTltGrgvAo54qhh3_twEDsETrQLAGU/edit#gid=0","cachedResultName":"Tests"},"documentId":{"__rl":true,"mode":"list","value":"10l_gMtPsge00eTTltGrgvAo54qhh3_twEDsETrQLAGU","cachedResultUrl":"https://docs.google.com/spreadsheets/d/10l_gMtPsge00eTTltGrgvAo54qhh3_twEDsETrQLAGU/edit?usp=drivesdk","cachedResultName":"LLM Judge"}},"credentials":{"googleSheetsOAuth2Api":{"id":"04iXS2lwUVyzn6F2","name":"Google Sheets account"}},"typeVersion":4.5},{"id":"49b0cc43-30d3-4331-a608-4f51c9ac0efd","name":"Structured Output Parser","type":"@n8n/n8n-nodes-langchain.outputParserStructured","position":[900,320],"parameters":{"jsonSchemaExample":"{\n \"reasoning\": \"The Assistant fabricated a $1 million figure and a 12-month provision that are not found in the source. This breaches factual correctness and completeness. The output would mislead business stakeholders if used without correction.\",\n \"decision\": \"Fail\"\n}"},"typeVersion":1.2},{"id":"cf3cce00-33c4-4bcd-a2a2-75d7e49439d3","name":"Update Results","type":"n8n-nodes-base.googleSheets","position":[1220,140],"parameters":{"columns":{"value":{"ID":"={{ $('Loop Over Items').item.json[\"ID\"] }}","URL":"={{ $('Loop Over Items').item.json[\"URL\"] }}","Input":"={{ $('Loop Over Items').item.json[\"Input\"] }}","Output":"={{ $('Loop Over Items').item.json[\"Output\"] }}","Decision":"={{ $json.output.decision }}","Test No.":"={{ $('Loop Over Items').item.json[\"Test No.\"] }}","Reasoning":"={{ $json.output.reasoning }}","AI Platform":"={{ $('Loop Over Items').item.json[\"AI Platform\"] }}","Relevant Source Reference":"={{ $('Loop Over Items').item.json[\"Relevant Source Reference\"] }}"},"schema":[{"id":"ID","type":"string","display":true,"removed":false,"required":false,"displayName":"ID","defaultMatch":false,"canBeUsedToMatch":true},{"id":"Test No.","type":"string","display":true,"removed":false,"required":false,"displayName":"Test No.","defaultMatch":false,"canBeUsedToMatch":true},{"id":"AI Platform","type":"string","display":true,"required":false,"displayName":"AI Platform","defaultMatch":false,"canBeUsedToMatch":true},{"id":"Relevant Source Reference","type":"string","display":true,"removed":false,"required":false,"displayName":"Relevant Source Reference","defaultMatch":false,"canBeUsedToMatch":true},{"id":"URL","type":"string","display":true,"removed":false,"required":false,"displayName":"URL","defaultMatch":false,"canBeUsedToMatch":true},{"id":"Input","type":"string","display":true,"required":false,"displayName":"Input","defaultMatch":false,"canBeUsedToMatch":true},{"id":"Output","type":"string","display":true,"required":false,"displayName":"Output","defaultMatch":false,"canBeUsedToMatch":true},{"id":"Decision","type":"string","display":true,"removed":false,"required":false,"displayName":"Decision","defaultMatch":false,"canBeUsedToMatch":true},{"id":"Reasoning","type":"string","display":true,"required":false,"displayName":"Reasoning","defaultMatch":false,"canBeUsedToMatch":true}],"mappingMode":"defineBelow","matchingColumns":[],"attemptToConvertTypes":false,"convertFieldsToString":false},"options":{},"operation":"append","sheetName":{"__rl":true,"mode":"list","value":537199982,"cachedResultUrl":"https://docs.google.com/spreadsheets/d/10l_gMtPsge00eTTltGrgvAo54qhh3_twEDsETrQLAGU/edit#gid=537199982","cachedResultName":"Results"},"documentId":{"__rl":true,"mode":"list","value":"10l_gMtPsge00eTTltGrgvAo54qhh3_twEDsETrQLAGU","cachedResultUrl":"https://docs.google.com/spreadsheets/d/10l_gMtPsge00eTTltGrgvAo54qhh3_twEDsETrQLAGU/edit?usp=drivesdk","cachedResultName":"Info Extraction Tasks (LLM Judge)"}},"credentials":{"googleSheetsOAuth2Api":{"id":"04iXS2lwUVyzn6F2","name":"Google Sheets account"}},"typeVersion":4.5},{"id":"80a9da50-93df-476a-a39f-ffece642a934","name":"Sticky Note","type":"n8n-nodes-base.stickyNote","position":[-540,220],"parameters":{"color":6,"width":360,"height":180,"content":"## 1. Fetch test cases\nWe start by grabbing our list of test cases stored in a Google Sheet [here](https://docs.google.com/spreadsheets/d/10l_gMtPsge00eTTltGrgvAo54qhh3_twEDsETrQLAGU/edit?usp=sharing).\n\nWe only want the rows that connect to a PDF document, as DOCX downloads will need to be handled separately."},"typeVersion":1},{"id":"2bba0cc9-460d-48c3-bcc6-a79c2b00b23e","name":"Sticky Note1","type":"n8n-nodes-base.stickyNote","position":[-80,-400],"parameters":{"color":6,"width":260,"height":200,"content":"## 2. Loop through our data\nWe use a loop here to visually see our workflow going through each item one-by-one."},"typeVersion":1},{"id":"7ccb6426-e79e-4cdc-ac5f-59c022f8978d","name":"Sticky Note2","type":"n8n-nodes-base.stickyNote","position":[200,-180],"parameters":{"color":6,"width":360,"content":"## 3. Grab the PDF as text\nWe download the PDF from the Google Drive link in the Google Sheet, extracting the file as text for the next step. We filter out any files that do not return data."},"typeVersion":1},{"id":"6dc17e29-ea6c-460b-b0e0-99cc41977891","name":"Sticky Note3","type":"n8n-nodes-base.stickyNote","position":[700,-180],"parameters":{"color":6,"width":360,"height":280,"content":"## 4. Judge LLM outputs\nOur prompt judges the LLM input/output and decides if the LLM passed the test. We also ask for a reason why the judge made its decision, which we can use to refine our eval later.\n\nWe're using OpenRouter here, which lets us easily tweak which LLM we want to use.\n\nThe output parser makes sure that the output is in JSON format, making the data easy to parse in the next step."},"typeVersion":1},{"id":"1fd001a6-e67a-406c-b81c-c0a454ecd73e","name":"LLM Judge","type":"@n8n/n8n-nodes-langchain.chainLlm","position":[740,140],"parameters":{"text":"=INPUT:\n\n{\n \"task\": {{ $('Loop Over Items').item.json['Input '] }},\n \"source\": {{ $json.text }},\n \"output\": {{ $('Loop Over Items').item.json['Output '] }}\n}\n\nOUTPUT:","messages":{"messageValues":[{"message":"=You are an expert legal evaluator. Your role is to decide if each AI output for an information extraction task is fully accurate.\nInstructions:\nRead the extraction task and the source material carefully.\n\n\nExtract the correct answer for yourself, based only on the source material. (Do not include your answer in your output.)\n\n\nFor each AI output provided, do the following:\n\n\nCompare it against your own extraction.\n\n\nJudge “accuracy” as follows:\n\n\nPass: The output is complete (all required info present), faithful (statements fully supported by the source), and contains no hallucinated or omitted content.\n\n\nFail: Any omission, misstatement, unsupported addition, or distortion.\n\n\nIgnore differences in style, order, or formatting. Judge on factual substance only.\n\n\nFor each output, give:\n\n\nAccuracy: Pass or Fail\n\n\nReasoning: In 1-2 sentences, cite the specific content that was included, missed, or misstated. Be as objective and specific as possible.\n\n\nOutput Format:\n\nGive your answer in a JSON format. Don't include any other text than the JSON.\n\n{\n \"decision\": \"Pass/Fail\",\n \"reasoning\": \"[Concise explanation referencing the substance of the source]\"\n}\n\nGuidelines:\nUse your internal extraction for comparison, but do not include it in your output.\n\n\nFocus only on whether the output accurately and fully reflects the required information from the source.\n\n\nDo not consider writing style or format.\n\n\nJustify your decision with evidence from the source.\n\n\nExamples:\n\nAI Output 1:\n\n{\n \"decision\": \"Pass\",\n \"reasoning\": \"Correctly included the requirements for advance notice and refund as set out in the source, with no omissions.\"\n}\n\n\nAI Output 2:\n\n{\n \"decision\": \"Fail\",\n \"reasoning\": \"Missed the requirement that refunds are provided only as credit, not cash, making the output incomplete.\"\n}"}]},"promptType":"define","hasOutputParser":true},"typeVersion":1.4},{"id":"2e94c252-9973-4915-ab21-838681e7ea5c","name":"Sticky Note4","type":"n8n-nodes-base.stickyNote","position":[1160,-80],"parameters":{"color":6,"width":260,"height":180,"content":"## 5. Update results\nWe create a new row in our output sheet, containing our original data together with the judge decision/reasoning."},"typeVersion":1},{"id":"05d7a5c0-3ad5-4ec1-ae3f-f533e16f8027","name":"Sticky Note5","type":"n8n-nodes-base.stickyNote","position":[1460,140],"parameters":{"color":6,"width":220,"height":180,"content":"## 6. Pause between runs\nWe wait half a second, so that we don't hit the OpenAI API rate limit and get an error."},"typeVersion":1},{"id":"099b10f3-c9cf-4fa0-9a07-ccb73ea11542","name":"Sticky Note6","type":"n8n-nodes-base.stickyNote","position":[-600,640],"parameters":{"width":460,"height":280,"content":"## Data format\nOur Tests Sheet contains the following columns:\n- ID: A unique identifier for each row\n- Test No.: The test that the LLM was given\n- AI Platform: The LLM that was given the test.\n- Relevant Source: The file name of the source document that was given to the LLM.\n- URL: The Google Drive URL where the file is stored.\n- Input: The input prompt that the LLM was given.\n- Output: The response that the LLM gave."},"typeVersion":1},{"id":"65532c6d-5638-432f-8f6a-a5a02b1aaf08","name":"Is PDF?","type":"n8n-nodes-base.if","position":[-260,440],"parameters":{"options":{},"conditions":{"options":{"version":2,"leftValue":"","caseSensitive":true,"typeValidation":"loose"},"combinator":"and","conditions":[{"id":"1609d1f6-2142-4965-8d6b-01cfa53251c4","operator":{"type":"string","operation":"contains"},"leftValue":"={{ $json['Relevant Source Reference'] }}","rightValue":".pdf"}]},"looseTypeValidation":true},"typeVersion":2.2},{"id":"17aa2c76-7ea7-41fe-84ed-4e485ef4db5e","name":"Is a file?","type":"n8n-nodes-base.if","position":[300,20],"parameters":{"options":{},"conditions":{"options":{"version":2,"leftValue":"","caseSensitive":true,"typeValidation":"strict"},"combinator":"and","conditions":[{"id":"ad7272d4-0840-4c71-bee4-64faf944e3ca","operator":{"type":"object","operation":"notEmpty","singleValue":true},"leftValue":"={{ $binary }}","rightValue":""}]}},"typeVersion":2.2}],"pinData":{},"connections":{"Wait":{"main":[[{"node":"Loop Over Items","type":"main","index":0}]]},"Is PDF?":{"main":[[{"node":"Loop Over Items","type":"main","index":0}]]},"Get Tests":{"main":[[{"node":"Is PDF?","type":"main","index":0}]]},"LLM Judge":{"main":[[{"node":"Update Results","type":"main","index":0}]]},"Is a file?":{"main":[[{"node":"Extract from File","type":"main","index":0}],[{"node":"Loop Over Items","type":"main","index":0}]]},"Google Drive":{"main":[[{"node":"Is a file?","type":"main","index":0}]]},"Update Results":{"main":[[{"node":"Wait","type":"main","index":0}]]},"Loop Over Items":{"main":[[],[{"node":"Google Drive","type":"main","index":0}]]},"Extract from File":{"main":[[{"node":"LLM Judge","type":"main","index":0}]]},"OpenRouter Chat Model":{"ai_languageModel":[[{"node":"LLM Judge","type":"ai_languageModel","index":0}]]},"Structured Output Parser":{"ai_outputParser":[[{"node":"LLM Judge","type":"ai_outputParser","index":0}]]},"When clicking ‘Test workflow’":{"main":[[{"node":"Get Tests","type":"main","index":0}]]}}} \ No newline at end of file