Userscript to export all emails, attachments, and threads from an Outlook Exchange inbox.
exchange-exporter.js
1// ==UserScript==
2// @name Outlook Web Message Harvester
3// @namespace http://tampermonkey.net/
4// @version 0.1
5// @description Iterate every message in the Inbox, capture the reading-pane text, and batch-export every 100 entries as JSON files.
6// @author Codex
7// @match file:///home/matmanna/projects/exchange-exporter/page.html*
8// @match https://outlook.office.com/*
9// @grant none
10// ==/UserScript==
11
12(function () {
13 'use strict';
14
// Default number of captured messages per exported JSON file (used as the
// default `batchSize` argument of runExtraction).
15 const DEFAULT_BATCH_SIZE = 100;
// Milliseconds to sleep after scrolling the list to its bottom (see scrollToBottom).
16 const SCROLL_PAUSE = 1200;
// Milliseconds between two predicate evaluations inside waitFor.
17 const WAIT_POLL = 300;
// Number of consecutive scans that find no unprocessed row before
// runExtraction leaves its scan/scroll loop.
18 const MAX_SCROLL_FAILURES = 4;
19
// Mutable state shared by the functions of this script.
20 const state = {
// True while runExtraction is executing; used to reject a concurrent start.
21 running: false,
// Row ids (see getRowId) that have already been handled.
22 processed: new Set(),
// Message records captured since the last JSON download.
23 collected: [],
// Counter incremented before each JSON download and used in the file label.
24 batchIndex: 0,
// NOTE(review): not read or written anywhere in the visible code.
25 attachmentDownloads: new Set(),
26 };
27
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the delay has elapsed.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
31
/**
 * Turn an arbitrary label into a string that is safe to use as a file name.
 *
 * Runs of characters that are reserved on common file systems
 * (`< > : \ " / | ? *` and ASCII control characters) become a single `_`,
 * whitespace runs collapse to one space, and the result is limited to
 * 190 characters.
 *
 * @param {*} name - Proposed file name; any value with a `toString()`.
 * @returns {string} The cleaned name, or `'attachment'` when the input is
 *   falsy or nothing usable is left after cleaning.
 */
function sanitizeFileName(name) {
  const fallbackName = 'attachment';
  if (!name) {
    return fallbackName;
  }

  const cleaned = name
    .toString()
    .replace(/[<>:\\"/|?*\u0000-\u001F]+/g, '_')
    .replace(/\s+/g, ' ')
    .trim()
    .slice(0, 190)
    // Truncation can cut right after a space; never end a file name with one.
    .trimEnd();

  // A whitespace-only input would otherwise produce an empty file name.
  return cleaned || fallbackName;
}
43
/**
 * Offer a Blob to the user as a file download.
 *
 * A temporary, hidden `<a download>` element pointing at an object URL for
 * the blob is added to the page, clicked, and removed again.
 *
 * @param {Blob} blob - Content to download.
 * @param {string} fileName - Suggested name for the downloaded file.
 * @returns {void}
 */
function triggerDownload(blob, fileName) {
  // Delay before the object URL is released. Revoking it synchronously after
  // click() can cancel the download in some browsers, because the browser may
  // not have started reading the blob yet.
  const revokeDelayMs = 10000;

  const url = URL.createObjectURL(blob);
  const anchor = document.createElement('a');
  anchor.href = url;
  anchor.download = fileName;
  anchor.style.display = 'none';
  document.body.appendChild(anchor);
  try {
    anchor.click();
  } finally {
    // Always clean up the helper element and (later) the object URL, even if
    // click() throws.
    anchor.remove();
    setTimeout(() => URL.revokeObjectURL(url), revokeDelayMs);
  }
}
54
/**
 * Poll a predicate until it yields a truthy value or the time budget runs out.
 *
 * The predicate may be synchronous or return a promise. Between attempts the
 * function sleeps for WAIT_POLL milliseconds.
 *
 * @param {Function} predicate - Called repeatedly; its (awaited) result is tested for truthiness.
 * @param {number} [timeout=20000] - Time budget in milliseconds.
 * @returns {Promise<*>} The first truthy predicate result, or `null` on timeout.
 */
async function waitFor(predicate, timeout = 20000) {
  const startedAt = Date.now();
  const withinBudget = () => Date.now() - startedAt < timeout;

  while (withinBudget()) {
    const outcome = await predicate();
    if (outcome) {
      return outcome;
    }
    await sleep(WAIT_POLL);
  }

  return null;
}
66
/**
 * Download an already-serialised JSON string as `outlook-messages-<label>.json`.
 *
 * @param {string} payload - JSON text to put into the file.
 * @param {string} batchLabel - Label inserted into the file name.
 * @returns {void}
 */
function downloadJson(payload, batchLabel) {
  const fileName = `outlook-messages-${batchLabel}.json`;
  const jsonBlob = new Blob([payload], { type: 'application/json' });
  triggerDownload(jsonBlob, fileName);
}
71
/**
 * Resolve the identifier of a message-list entry.
 *
 * Candidates are tried in this order and the first truthy one wins:
 * `meta.id`, the wrapper's `data-convid`, the row's `data-convid`,
 * the wrapper's element id, the row's element id.
 *
 * @param {{id?: string, wrapper?: object, row?: object}} meta - Row descriptor.
 * @returns {string|undefined} The identifier, or a falsy value when none is available.
 */
function getRowId(meta) {
  const wrapperEl = meta?.wrapper;
  const rowEl = meta?.row;

  return (
    meta?.id ||
    wrapperEl?.dataset?.convid ||
    rowEl?.dataset?.convid ||
    wrapperEl?.id ||
    rowEl?.id
  );
}
78
/**
 * Build the exported record for one message from its list row and the
 * content captured from the reading pane.
 *
 * @param {{row?: Element, wrapper?: Element, id?: string}} meta - Row descriptor.
 * @param {?string} readingPaneSubject - Subject read from the reading pane.
 * @param {{text: ?string, html: ?string}} readingPaneBody - Captured body content.
 * @returns {object} Record with id, subject, sender, list snippet/time, pane
 *   body (text and HTML), pane subject and an ISO capture timestamp. Missing
 *   values are `null`.
 */
function hydrateMessage(meta, readingPaneSubject, readingPaneBody) {
  const listRow = meta.row;
  // Trimmed text of the first descendant matching `selector`, if any.
  const readText = (selector) => listRow?.querySelector(selector)?.innerText?.trim();

  const listSubject = readText('[class*="lvHighlightSubjectClass"]');
  const from = readText('[class*="lvHighlightFromClass"]');
  const receivedLabel = readText('._lvv_M');
  const preview = readText('._lvv_O span') || readText('._lvv_N span');

  const record = {
    id: getRowId(meta),
    // The reading pane subject takes precedence over the list subject.
    subject: readingPaneSubject || listSubject || null,
    sender: from || null,
    listSnippet: preview || null,
    listTime: receivedLabel || null,
    paneBodyText: readingPaneBody.text || null,
    paneBodyHtml: readingPaneBody.html || null,
    paneSubject: readingPaneSubject || null,
    capturedAt: new Date().toISOString(),
  };
  return record;
}
98
/**
 * Read subject and body of the message currently shown in the reading pane.
 *
 * The function first waits (up to 10 s) until the pane shows a subject that
 * equals, contains, or is contained in `expectedSubject`; without an expected
 * subject any non-empty subject is accepted. Only afterwards is the body
 * element looked up, so the element that is read is the one present once the
 * pane has caught up with the clicked row. (An element looked up before the
 * wait may be missing because the pane has not rendered yet, or may belong to
 * the previously opened message.)
 *
 * If the wait times out, whatever the pane currently shows is still captured
 * and a warning is logged so the record can be reviewed.
 *
 * @param {?string} expectedSubject - Subject text taken from the list row.
 * @returns {Promise<{subject: ?string, text: ?string, html: ?string}>}
 *   Captured content; all fields are `null` when no body element is found.
 */
async function captureReadingPane(expectedSubject) {
  const subjectSelector = '[autoid="_rp_4"], .rpHighlightSubjectClass';
  const bodyElementId = 'Item.MessageUniqueBody';
  const readSubject = () => document.querySelector(subjectSelector)?.innerText?.trim() || null;
  const findBody = () => document.getElementById(bodyElementId);

  const matchedSubject = await waitFor(() => {
    const currentSubject = readSubject();
    if (!currentSubject) {
      return false;
    }
    if (!expectedSubject) {
      return currentSubject;
    }
    const matches =
      currentSubject === expectedSubject ||
      expectedSubject.includes(currentSubject) ||
      currentSubject.includes(expectedSubject);
    return matches ? currentSubject : false;
  }, 10000);

  if (!matchedSubject) {
    console.warn('[OutlookExtractor] reading pane did not show the expected subject in time', expectedSubject);
  }

  // Once the subject matched, give the body a short moment to render. If the
  // subject never matched, do not add a second long wait on top of the first.
  const bodyEl = matchedSubject ? await waitFor(findBody, 3000) : findBody();
  if (!bodyEl) {
    return { subject: null, text: null, html: null };
  }

  const subjectText = readSubject();
  const text = bodyEl.innerText?.trim() || null;
  const html = bodyEl.innerHTML || null;

  return { subject: subjectText, text, html };
}
127
/**
 * Bring a list row into view, click it, and give the page 400 ms to react.
 *
 * @param {Element} row - Row element to activate.
 * @returns {Promise<void>}
 */
async function clickRow(row) {
  const scrollOptions = { block: 'center', inline: 'nearest' };
  row.scrollIntoView(scrollOptions);
  row.click();
  await sleep(400);
}
133
/**
 * Collect the message rows that are currently rendered and visible.
 *
 * Every element carrying `data-convid` is treated as a wrapper; its first
 * `[role="option"]` descendant is the row, falling back to the wrapper
 * itself. Rows marked `aria-hidden="true"` or with a zero width/height are
 * left out.
 *
 * @returns {Array<{wrapper: Element, row: Element, id: (string|undefined)}>}
 *   Descriptors in document order.
 */
function getMessageRows() {
  const isVisible = (el) =>
    el.getAttribute('aria-hidden') !== 'true' && el.offsetWidth !== 0 && el.offsetHeight !== 0;

  return [...document.querySelectorAll('[data-convid]')].flatMap((wrapper) => {
    const row = wrapper.querySelector('[role="option"]') || wrapper;
    if (!isVisible(row)) {
      return [];
    }
    return [{ wrapper, row, id: getRowId({ wrapper, row }) }];
  });
}
155
/**
 * Find the element that scrolls the message list.
 *
 * @param {?Element} sampleRow - Any row inside the list.
 * @returns {Element} The closest ancestor whose class contains
 *   "scrollContainer", or the document's root element when there is none.
 */
function getScrollContainer(sampleRow) {
  const scroller = sampleRow?.closest('[class*="scrollContainer"]');
  if (scroller) {
    return scroller;
  }
  return document.documentElement;
}
159
/**
 * Scroll a container to the end of its current content, then wait
 * SCROLL_PAUSE milliseconds.
 *
 * @param {Element} container - Element to scroll (may be the document's root element).
 * @returns {Promise<void>}
 */
async function scrollToBottom(container) {
  // Whether or not the container is the root element, it is scrolled directly.
  const scroller = container;
  scroller.scrollTop = scroller.scrollHeight;
  await sleep(SCROLL_PAUSE);
}
165
/**
 * Open one message from the list, capture its reading-pane content, and add
 * the resulting record to the pending batch. When the pending batch reaches
 * `batchSize` records it is downloaded as a JSON file and cleared.
 *
 * Rows without an id and rows that were already handled are ignored. A row
 * whose element has been detached from the document since the list was
 * scanned is skipped WITHOUT being marked as processed: clicking a detached
 * element cannot open the message, so the row is left for the next scan.
 *
 * @param {{row?: Element, wrapper?: Element, id?: string}} meta - Row descriptor.
 * @param {number} batchSize - Number of records per exported file.
 * @returns {Promise<void>}
 */
async function processRow(meta, batchSize) {
  const rowId = getRowId(meta);
  if (!rowId || state.processed.has(rowId)) {
    return;
  }

  // Explicit `=== false` so that only a known-detached element is skipped.
  if (meta.row?.isConnected === false) {
    return;
  }

  // Mark before the asynchronous work so the row is never handled twice.
  state.processed.add(rowId);

  const subjectFromList = meta.row?.querySelector('[class*="lvHighlightSubjectClass"]')?.innerText?.trim();

  await clickRow(meta.row);
  const pane = await captureReadingPane(subjectFromList);

  const message = hydrateMessage(meta, pane.subject, pane);
  state.collected.push(message);

  if (state.collected.length >= batchSize) {
    state.batchIndex += 1;
    downloadJson(JSON.stringify(state.collected, null, 2), `${state.batchIndex}-${new Date().toISOString().replace(/[:.]/g, '-')}`);
    state.collected = [];
  }
}
188
/**
 * Walk the whole message list and export every message.
 *
 * The function waits up to 30 s for the first `[data-convid]` element, then
 * repeatedly scans the rendered rows, processes the ones not handled yet, and
 * scrolls the list container to its bottom. It stops after
 * MAX_SCROLL_FAILURES consecutive scans without any unprocessed row. Records
 * still pending at the end (including after an error) are downloaded as a
 * final batch.
 *
 * Only one extraction runs at a time; a call made while another one is
 * running logs a message and returns. Errors are logged, not thrown.
 *
 * @param {number} [batchSize=DEFAULT_BATCH_SIZE] - Records per exported file.
 *   Values that are not a finite number of at least 1 fall back to
 *   DEFAULT_BATCH_SIZE.
 * @returns {Promise<void>}
 */
async function runExtraction(batchSize = DEFAULT_BATCH_SIZE) {
  if (state.running) {
    console.log('[OutlookExtractor] already running.');
    return;
  }

  // The default parameter only covers `undefined`; guard against null, 0,
  // negative and non-numeric values, which would otherwise flush a file per
  // message or never flush until the end.
  const requestedSize = Number(batchSize);
  const effectiveBatchSize =
    Number.isFinite(requestedSize) && requestedSize >= 1 ? Math.floor(requestedSize) : DEFAULT_BATCH_SIZE;

  state.running = true;
  try {
    const firstRowWrapper = await waitFor(() => document.querySelector('[data-convid]'), 30000);
    if (!firstRowWrapper) {
      console.warn('[OutlookExtractor] message pane never appeared.');
      return;
    }
    const baseContainer = getScrollContainer(firstRowWrapper.querySelector('[role="option"]') || firstRowWrapper);
    let scrollFailures = 0;

    while (true) {
      const rows = getMessageRows();
      const unprocessed = rows.filter((row) => !state.processed.has(getRowId(row)));

      if (unprocessed.length === 0) {
        scrollFailures += 1;
      } else {
        scrollFailures = 0;
      }

      for (const row of unprocessed) {
        try {
          await processRow(row, effectiveBatchSize);
        } catch (rowError) {
          // One broken message must not abort the whole export. Mark the row
          // as handled so that it cannot be retried forever.
          const failedId = getRowId(row);
          if (failedId) {
            state.processed.add(failedId);
          }
          console.error('[OutlookExtractor] skipping message after error', failedId, rowError);
        }
      }

      if (scrollFailures >= MAX_SCROLL_FAILURES) {
        break;
      }

      await scrollToBottom(baseContainer);
    }
  } catch (error) {
    console.error('[OutlookExtractor] failed', error);
  } finally {
    // Flush whatever has been captured so far, even after a failure.
    if (state.collected.length) {
      state.batchIndex += 1;
      downloadJson(JSON.stringify(state.collected, null, 2), `${state.batchIndex}-${new Date().toISOString().replace(/[:.]/g, '-')}`);
      state.collected = [];
    }
    state.running = false;
    console.log('[OutlookExtractor] finished.');
  }
}
236
// Handle exposed on `window` so an extraction can be started, and its status
// queried, from the browser console.
window.outlookEmailExtractor = {
  /** Start an extraction; resolves when runExtraction has finished. */
  start(batchSize) {
    return runExtraction(batchSize);
  },
  /** Whether an extraction is currently in progress. */
  isRunning() {
    return state.running;
  },
};

// start automatically once the page settles
setTimeout(() => {
  runExtraction();
}, 5000);
244})();