Userscript to export all emails, attachments, and threads from an Outlook Exchange inbox.
exchange-exporter.js
244 lines 8.5 kB view raw
// ==UserScript==
// @name         Outlook Web Message Harvester
// @namespace    http://tampermonkey.net/
// @version      0.1
// @description  Iterate every message in the Inbox, capture the reading-pane text, and batch-export every 100 entries as JSON files.
// @author       Codex
// @match        file:///home/matmanna/projects/exchange-exporter/page.html*
// @match        https://outlook.office.com/*
// @grant        none
// ==/UserScript==

/*
 * Walks the message list, clicks each row, copies what the reading pane shows
 * and downloads the collected records as JSON files in batches.
 *
 * Public surface (unchanged): window.outlookEmailExtractor.start(batchSize)
 * and window.outlookEmailExtractor.isRunning(). Extraction also starts on its
 * own five seconds after the script is injected.
 */
(function () {
  'use strict';

  // Tunables (all durations are in milliseconds).
  const DEFAULT_BATCH_SIZE = 100;
  const SCROLL_PAUSE = 1200;
  const WAIT_POLL = 300;
  const MAX_SCROLL_FAILURES = 4;
  const CLICK_SETTLE_DELAY = 400;
  const PANE_SUBJECT_TIMEOUT = 10000;
  const PANE_BODY_TIMEOUT = 5000;
  const REVOKE_DELAY = 10000;
  const AUTO_START_DELAY = 5000;

  // DOM hooks used to locate list rows and the reading pane.
  const ROW_WRAPPER_SELECTOR = '[data-convid]';
  const ROW_OPTION_SELECTOR = '[role="option"]';
  const LIST_SUBJECT_SELECTOR = '[class*="lvHighlightSubjectClass"]';
  const LIST_SENDER_SELECTOR = '[class*="lvHighlightFromClass"]';
  const PANE_SUBJECT_SELECTOR = '[autoid="_rp_4"], .rpHighlightSubjectClass';
  const PANE_BODY_ID = 'Item.MessageUniqueBody';

  const state = {
    running: false,
    // Row ids that have already been handled (kept across runs).
    processed: new Set(),
    // Messages captured since the last batch download.
    collected: [],
    batchIndex: 0,
    // Not referenced by any code in this script yet.
    attachmentDownloads: new Set(),
  };

  /**
   * Resolve after `ms` milliseconds.
   * @param {number} ms
   * @returns {Promise<void>}
   */
  function sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Turn an arbitrary value into a safe file name: reserved and control
   * characters become `_`, whitespace runs collapse to one space, and the
   * result is capped at 190 characters. Falls back to 'attachment' when the
   * input is empty or sanitises down to nothing.
   * Currently unused by the rest of the script.
   * @param {*} name
   * @returns {string}
   */
  function sanitizeFileName(name) {
    if (!name) {
      return 'attachment';
    }
    const cleaned = name
      .toString()
      .replace(/[<>:\\"/|?*\u0000-\u001F]+/g, '_')
      .replace(/\s+/g, ' ')
      .trim()
      .slice(0, 190);
    return cleaned || 'attachment';
  }

  /**
   * Start a browser download of `blob` under `fileName` via a temporary anchor.
   * @param {Blob} blob
   * @param {string} fileName
   */
  function triggerDownload(blob, fileName) {
    const url = URL.createObjectURL(blob);
    const anchor = document.createElement('a');
    anchor.href = url;
    anchor.download = fileName;
    document.body.appendChild(anchor);
    anchor.click();
    document.body.removeChild(anchor);
    // Revoke later rather than synchronously: the download started by click()
    // may not have read the blob URL yet.
    setTimeout(() => URL.revokeObjectURL(url), REVOKE_DELAY);
  }

  /**
   * Poll `predicate` every WAIT_POLL ms until it returns a truthy value or
   * `timeout` ms have elapsed.
   * @param {function(): *} predicate - may be sync or async
   * @param {number} [timeout=20000]
   * @returns {Promise<*>} the first truthy result, or null on timeout
   */
  async function waitFor(predicate, timeout = 20000) {
    const start = Date.now();
    while (Date.now() - start < timeout) {
      const result = await predicate();
      if (result) {
        return result;
      }
      await sleep(WAIT_POLL);
    }
    return null;
  }

  /**
   * Download an already-serialised JSON string as
   * `outlook-messages-<batchLabel>.json`.
   * @param {string} payload
   * @param {string} batchLabel
   */
  function downloadJson(payload, batchLabel) {
    const blob = new Blob([payload], { type: 'application/json' });
    triggerDownload(blob, `outlook-messages-${batchLabel}.json`);
  }

  /**
   * Download everything in `state.collected` as the next numbered batch and
   * empty the buffer. Does nothing when the buffer is empty.
   */
  function flushBatch() {
    if (state.collected.length === 0) {
      return;
    }
    state.batchIndex += 1;
    const stamp = new Date().toISOString().replace(/[:.]/g, '-');
    downloadJson(JSON.stringify(state.collected, null, 2), `${state.batchIndex}-${stamp}`);
    state.collected = [];
  }

  /**
   * Coerce a caller-supplied batch size to a positive integer, falling back to
   * DEFAULT_BATCH_SIZE for anything else (0, negatives, NaN, non-numbers).
   * @param {*} batchSize
   * @returns {number}
   */
  function normalizeBatchSize(batchSize) {
    const size = Math.floor(Number(batchSize));
    return Number.isFinite(size) && size > 0 ? size : DEFAULT_BATCH_SIZE;
  }

  /**
   * Resolve the identifier of a list row: an explicit `meta.id` wins, then the
   * `data-convid` of the wrapper or row, then their element ids.
   * @param {{id?: string, wrapper?: HTMLElement, row?: HTMLElement}} meta
   * @returns {string|undefined}
   */
  function getRowId(meta) {
    if (meta?.id) {
      return meta.id;
    }
    return meta?.wrapper?.dataset?.convid || meta?.row?.dataset?.convid || meta?.wrapper?.id || meta?.row?.id;
  }

  /**
   * Trimmed subject text shown in a list row, if any.
   * @param {HTMLElement|undefined} row
   * @returns {string|undefined}
   */
  function readListSubject(row) {
    return row?.querySelector(LIST_SUBJECT_SELECTOR)?.innerText?.trim();
  }

  /**
   * Trimmed subject text currently shown in the reading pane, if any.
   * @returns {string|undefined}
   */
  function readPaneSubject() {
    return document.querySelector(PANE_SUBJECT_SELECTOR)?.innerText?.trim();
  }

  /**
   * Build the exported record for one message from its list row and the
   * captured reading-pane content. Missing values are exported as null.
   * @param {{row?: HTMLElement}} meta
   * @param {string|null} readingPaneSubject
   * @param {{text: string|null, html: string|null}} readingPaneBody
   * @returns {object}
   */
  function hydrateMessage(meta, readingPaneSubject, readingPaneBody) {
    const row = meta.row;
    const subjectFromList = readListSubject(row);
    const sender = row?.querySelector(LIST_SENDER_SELECTOR)?.innerText?.trim();
    const time = row?.querySelector('._lvv_M')?.innerText?.trim();
    const snippet =
      row?.querySelector('._lvv_O span')?.innerText?.trim() || row?.querySelector('._lvv_N span')?.innerText?.trim();

    return {
      id: getRowId(meta),
      subject: readingPaneSubject || subjectFromList || null,
      sender: sender || null,
      listSnippet: snippet || null,
      listTime: time || null,
      paneBodyText: readingPaneBody.text || null,
      paneBodyHtml: readingPaneBody.html || null,
      paneSubject: readingPaneSubject || null,
      capturedAt: new Date().toISOString(),
    };
  }

  /**
   * Wait for the reading pane to show the expected message, then copy its
   * subject and body.
   *
   * The body element is looked up only AFTER the subject wait, so the capture
   * never reads an element reference taken before the pane updated and does
   * not give up immediately when the pane has not rendered yet.
   *
   * @param {string|undefined} expectedSubject - subject text from the list row;
   *   when falsy, any non-empty pane subject is accepted
   * @returns {Promise<{subject: string|null, text: string|null, html: string|null}>}
   */
  async function captureReadingPane(expectedSubject) {
    const matchedSubject = await waitFor(() => {
      const currentSubject = readPaneSubject();
      if (!currentSubject) {
        return false;
      }
      if (!expectedSubject) {
        return currentSubject;
      }
      // Either side may be truncated, so accept containment both ways.
      const matches =
        currentSubject === expectedSubject ||
        expectedSubject.includes(currentSubject) ||
        currentSubject.includes(expectedSubject);
      return matches ? currentSubject : false;
    }, PANE_SUBJECT_TIMEOUT);

    if (!matchedSubject) {
      // Best effort: still capture whatever is displayed, but make it visible.
      console.warn('[OutlookExtractor] reading pane subject did not match in time; capturing current pane.', expectedSubject);
    }

    const bodyEl = await waitFor(() => document.getElementById(PANE_BODY_ID), PANE_BODY_TIMEOUT);

    return {
      subject: readPaneSubject() || null,
      text: bodyEl?.innerText?.trim() || null,
      html: bodyEl?.innerHTML || null,
    };
  }

  /**
   * Scroll a list row into view, click it, and pause briefly so the page can
   * react.
   * @param {HTMLElement} row
   */
  async function clickRow(row) {
    row.scrollIntoView({ block: 'center', inline: 'nearest' });
    row.click();
    await sleep(CLICK_SETTLE_DELAY);
  }

  /**
   * Collect the message rows currently rendered and visible in the list.
   * @returns {Array<{wrapper: HTMLElement, row: HTMLElement, id: string|undefined}>}
   */
  function getMessageRows() {
    const rows = [];

    for (const wrapper of document.querySelectorAll(ROW_WRAPPER_SELECTOR)) {
      const rowElement = wrapper.querySelector(ROW_OPTION_SELECTOR) || wrapper;
      const isHidden = rowElement.getAttribute('aria-hidden') === 'true';
      const hasNoSize = rowElement.offsetWidth === 0 || rowElement.offsetHeight === 0;
      if (isHidden || hasNoSize) {
        continue;
      }
      rows.push({ wrapper, row: rowElement, id: getRowId({ wrapper, row: rowElement }) });
    }

    return rows;
  }

  /**
   * Find the scrollable ancestor of a row, defaulting to the document element.
   * @param {HTMLElement|null|undefined} sampleRow
   * @returns {HTMLElement}
   */
  function getScrollContainer(sampleRow) {
    return sampleRow?.closest('[class*="scrollContainer"]') || document.documentElement;
  }

  /**
   * Jump the container to its current bottom and wait SCROLL_PAUSE ms, giving
   * the page time to render further rows.
   * @param {HTMLElement} container
   */
  async function scrollToBottom(container) {
    container.scrollTop = container.scrollHeight;
    await sleep(SCROLL_PAUSE);
  }

  /**
   * Open one message, capture it, and flush a batch when the buffer is full.
   * Rows without an id, rows already handled, and rows whose element is no
   * longer attached to the document are skipped; detached rows are NOT marked
   * as processed so a later scan can pick up their re-rendered counterpart.
   * @param {{wrapper?: HTMLElement, row: HTMLElement, id?: string}} meta
   * @param {number} batchSize
   */
  async function processRow(meta, batchSize) {
    const rowId = getRowId(meta);
    if (!rowId || state.processed.has(rowId)) {
      return;
    }
    if (!meta.row?.isConnected) {
      return;
    }

    // Marked up-front so a row that fails is not retried forever.
    state.processed.add(rowId);

    const subjectFromList = readListSubject(meta.row);

    await clickRow(meta.row);
    const pane = await captureReadingPane(subjectFromList);

    state.collected.push(hydrateMessage(meta, pane.subject, pane));

    if (state.collected.length >= batchSize) {
      flushBatch();
    }
  }

  /**
   * Main loop: process every visible unprocessed row, scroll to the bottom,
   * and repeat until MAX_SCROLL_FAILURES consecutive passes find nothing new.
   * Whatever is still buffered is downloaded when the loop ends, including
   * after an error.
   * @param {number} [batchSize=DEFAULT_BATCH_SIZE] - messages per JSON file;
   *   invalid values fall back to the default
   */
  async function runExtraction(batchSize = DEFAULT_BATCH_SIZE) {
    if (state.running) {
      console.log('[OutlookExtractor] already running.');
      return;
    }
    state.running = true;
    const effectiveBatchSize = normalizeBatchSize(batchSize);

    try {
      const firstRowWrapper = await waitFor(() => document.querySelector(ROW_WRAPPER_SELECTOR), 30000);
      if (!firstRowWrapper) {
        console.warn('[OutlookExtractor] message pane never appeared.');
        return;
      }
      const baseContainer = getScrollContainer(firstRowWrapper.querySelector(ROW_OPTION_SELECTOR) || firstRowWrapper);
      let scrollFailures = 0;

      while (true) {
        // Rows with no resolvable id can never be marked processed; counting
        // them as pending would keep resetting the failure counter forever.
        const unprocessed = getMessageRows().filter((row) => {
          const rowId = getRowId(row);
          return Boolean(rowId) && !state.processed.has(rowId);
        });

        scrollFailures = unprocessed.length === 0 ? scrollFailures + 1 : 0;

        for (const row of unprocessed) {
          try {
            await processRow(row, effectiveBatchSize);
          } catch (rowError) {
            // One bad row should not end the whole harvest.
            console.warn('[OutlookExtractor] skipping row after error', getRowId(row), rowError);
          }
        }

        if (scrollFailures >= MAX_SCROLL_FAILURES) {
          break;
        }

        await scrollToBottom(baseContainer);
      }
    } catch (error) {
      console.error('[OutlookExtractor] failed', error);
    } finally {
      flushBatch();
      state.running = false;
      console.log('[OutlookExtractor] finished.');
    }
  }

  window.outlookEmailExtractor = {
    start: (batchSize) => runExtraction(batchSize),
    isRunning: () => state.running,
  };

  // start automatically once the page settles
  setTimeout(() => runExtraction(), AUTO_START_DELAY);
})();