User:FL.YL.BANxS/archiver.js
外观
注意:保存之后,你必须清除浏览器缓存才能看到做出的更改。Google Chrome、Firefox、Microsoft Edge及Safari:按住⇧ Shift键并单击工具栏的“刷新”按钮。参阅Help:绕过浏览器缓存以获取更多帮助。
// Node.js core modules used by the script.
const process = require('process');
// Register console.error as the uncaughtException handler, so such errors are printed by this handler.
process.on('uncaughtException', console.error);
const http = require('http');
const https = require('https');
const querystring = require('querystring');
const tls = require('tls');
// Set the default minimum TLS protocol version to TLS 1.2.
tls.DEFAULT_MIN_VERSION = 'TLSv1.2';
// User-Agent header sent with every outgoing request made through request().
const USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0';
// Base URLs of the wiki being crawled.
const SITE_PREFIX = 'https://zh.wikipedia.org';
const TITLE_PREFIX = `${SITE_PREFIX}/wiki/`;
// Special pages that redirect to a random page / a random member of a category.
const PAGE_RANDOM = `${TITLE_PREFIX}Special:随机页面`;
const CAT_RANDOM = `${TITLE_PREFIX}Special:分类内随机`;
// Sources of random pages; generatePage rotates through this array.
const GENERATE_RANDOM = [
  PAGE_RANDOM,
  ...['典范条目', '優良條目', '特色图片', '特色列表', '新闻动态'].map(
    (category) => `${CAT_RANDOM}/${category}`
  ),
  'https://commons.wikimedia.org/wiki/Special:Random/File',
];
// Defaults for main(): seconds between crawl ticks and number of crawl timers.
const INTERVAL_SECOND = 10;
const COUNT = 3;
// URLs that have already been handled.
const ED = new Set();
/**
 * Removes and returns the first (oldest inserted) value of the set.
 * Returns undefined when the set is empty.
 */
Set.prototype.shift = function () {
  const first = this.values().next().value;
  this.delete(first);
  return first;
};
/**
 * Removes and returns the last (most recently inserted) value of the set.
 * Returns undefined when the set is empty.
 */
Set.prototype.pop = function () {
  const items = Array.from(this);
  const last = items[items.length - 1];
  this.delete(last);
  return last;
};
/**
 * Case-insensitive indexOf: both this string and `value` are lower-cased
 * before searching.
 * @param {string} value - text to look for
 * @param {number} [n] - position to start searching from
 * @returns {number} index of the first match, or -1
 */
String.prototype.indexOfL = function (value, n) {
  const haystack = String(this).toLowerCase();
  const needle = value.toLowerCase();
  return haystack.indexOf(needle, n);
};
/**
 * Replaces every occurrence of `from` with `to`, treating both as literal text.
 *
 * The string is scanned once per search term, so the call always terminates.
 * The previous loop ("replace while the term is still present") never ended
 * when the replacement contained the search term or when the term was empty.
 * Text produced by a replacement is not searched again.
 *
 * @param {string|Array} from - search term, or an array of terms applied in
 *   order; non-string terms are converted with String(). Empty terms are ignored.
 * @param {*} to - replacement, converted with String() and inserted verbatim
 *   (`$` sequences are not interpreted).
 * @returns {string} the resulting string
 */
String.prototype.replaceAll = function (from, to) {
  const replacement = String(to);
  const needles = Array.isArray(from) ? from : [from];
  let string = String(this);
  for (const entry of needles) {
    const needle = String(entry);
    // An empty term matches everywhere; there is nothing meaningful to replace.
    if (needle === '') continue;
    string = string.split(needle).join(replacement);
  }
  return string;
};
/**
 * Tests whether this string contains `value`, or any item of `value` when it
 * is an array.
 *
 * In case-insensitive mode both sides are lower-cased. Previously only array
 * items were lower-cased, so a scalar search term containing an upper-case
 * letter could never match.
 *
 * @param {string|Array} value - search term or list of terms; non-string
 *   terms are converted with String()
 * @param {boolean} [low=true] - compare case-insensitively when truthy
 * @returns {boolean} true when at least one term occurs in the string
 */
String.prototype.exist = function (value, low = true) {
  const fold = (text) => (low ? text.toLowerCase() : text);
  const haystack = fold(String(this));
  const needles = Array.isArray(value) ? value : [value];
  return needles.some((needle) => haystack.includes(fold(String(needle))));
};
/**
 * Adds up the items of the array after converting each one with Number().
 * Returns NaN as soon as an item is null, undefined, NaN, or converts to NaN.
 * An empty array sums to 0.
 */
Array.prototype.sum = function () {
  let total = 0;
  for (const item of this) {
    // Number(null) is 0, so null has to be rejected before converting.
    if (item === null) return NaN;
    const numeric = Number(item);
    if (Number.isNaN(numeric)) return NaN;
    total += numeric;
  }
  return total;
};
/**
 * Arithmetic mean of the array: this.sum() divided by the array length.
 * An empty array yields NaN (0 / 0).
 */
Array.prototype.avg = function () {
  const total = this.sum();
  const size = this.length;
  return total / size;
};
/**
 * Splits a duration in seconds into weeks, days, hours, minutes and seconds.
 * @param {number} raw - duration in seconds (fractions are floored)
 * @returns {object} `{ raw, second, minute, hour, day, week, toString }`;
 *   `toString(full)` formats the parts, omitting zero-valued leading units
 *   unless `full` is truthy. Seconds are always printed.
 */
const date = (raw) => {
  const totalSeconds = Math.floor(raw);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const totalHours = Math.floor(totalMinutes / 60);
  const totalDays = Math.floor(totalHours / 24);
  const totalWeeks = Math.floor(totalDays / 7);
  // Units printed before the seconds, largest first.
  const labels = [
    ['week', 'Week'],
    ['day', 'Day'],
    ['hour', 'Hour'],
    ['minute', 'Minute'],
  ];
  const parts = {
    raw,
    second: totalSeconds - (totalMinutes * 60),
    minute: totalMinutes - (totalHours * 60),
    hour: totalHours - (totalDays * 24),
    day: totalDays - (totalWeeks * 7),
    week: totalWeeks,
    toString: (full) => {
      const shown = labels
        .filter(([key]) => full || parts[key] > 0)
        .map(([key, label]) => `${parts[key]} ${label}`);
      shown.push(`${parts.second} Second`);
      return shown.join(' ');
    },
  };
  return parts;
};
/**
 * Queued HTTP(S) client.
 *
 * `request(options)` only enqueues the call; a timer starts one queued
 * request per `interval` milliseconds. The result is delivered through
 * `options.callback(info)`.
 *
 * options:
 *   method   - HTTP method (upper-cased before use)
 *   url      - absolute URL (string or URL object)
 *   head     - request headers; defaults for user-agent, referer and accept
 *              are added when missing (optional, defaults to {})
 *   body     - request body, anything Buffer.from() accepts (optional,
 *              defaults to ''); content-length is set when it is non-empty
 *   size     - 0: deliver headers only; > 0: stop after that many body bytes;
 *              < 0: read the whole body. Must convert to a safe integer.
 *   redirect - follow `Location` headers (at most 10) when truthy
 *   callback - optional function receiving `info`
 *
 * info:
 *   url      - array of the URLs requested, in order of first use
 *   code     - [statusCode, statusMessage]; [-1, 'Protocol Invalid'] for a
 *              non-http(s) URL; [-3, 'Exception'] once the retry budget
 *              (10 retries, 10 s apart) is exhausted
 *   head     - response headers, or null for the negative codes
 *   body     - Buffer with the received body, or null for the negative codes
 *   redirect - number of redirects followed
 *   error    - errors collected from failed attempts
 */
const request = (() => {
  let interval = 1000;
  let handle = (options) => {
    let { method, url, head = {}, body = '', size, redirect, callback } = options;
    method = method.toUpperCase();
    url = (new URL(url));
    body = Buffer.from(body);
    {
      // Normalise header names to lower case and fill in defaults.
      let obj = {};
      for (let i of Object.keys(head))
        obj[i.toLowerCase()] = head[i];
      let add = (name, val, verify = 1) => {
        name = name.toLowerCase();
        // With `verify` set, a header supplied by the caller wins.
        if (verify && Object.prototype.hasOwnProperty.call(obj, name)) return;
        obj[name] = val;
      };
      add('user-agent', USER_AGENT);
      add('referer', `${url.origin}/`);
      add('accept', '*/*');
      if (body.length > 0) add('content-length', body.length, 0);
      head = obj;
    }
    size = Number(size);
    if (!Number.isSafeInteger(size)) throw (new Error('the format of "receive size" is invalid.'));
    let rcount = 10; // redirects still allowed
    let tcount = 10; // retries still allowed
    // Kept separate from `info` so that it stays a Set across retries;
    // `info.url` is only filled in when the result is delivered.
    let visited = (new Set());
    let info = {
      url: [],
      code: 0,
      head: null,
      body: Buffer.from([]),
      redirect: 0,
      error: []
    };
    // Single delivery point: snapshot the visited URLs and notify the caller.
    let finish = () => {
      info.url = [...visited];
      if (callback) callback(info);
    };
    let to = () => {
      visited.add(url.href);
      // Each attempt starts with an empty body so that a partial download
      // from a failed attempt is not prepended to the next one.
      info.body = Buffer.from([]);
      let h = (() => {
        switch (url.protocol) {
          case 'http:': {
            return http;
          }
          case 'https:': {
            return https;
          }
          default: {
            return null;
          }
        }
      })();
      if (!h) {
        info.code = [-1, 'Protocol Invalid'];
        info.head = null;
        info.body = null;
        finish();
        return;
      }
      let option = {
        method,
        headers: head
      };
      console.log(`FETCH: ${method} ${url.href}`);
      // Set once this attempt has been settled (delivered, redirected or failed).
      let ed = 0;
      // Shared failure path for request errors and response stream errors.
      let fail = (err) => {
        console.error(err, url.href);
        if (ed) return;
        ed = 1;
        info.error.push(err);
        if ((tcount--) <= 0) {
          info.code = [-3, 'Exception'];
          info.head = null;
          info.body = null;
          finish();
          return;
        }
        setTimeout(to, 10 * 1000);
      };
      let req = h.request((url.href.indexOf('%') === -1 ? encodeURI(url.href) : url.href), option, (reply) => {
        let l = reply.headers.location;
        if (l && redirect && (rcount--) > 0) {
          let next = null;
          try {
            // Resolves absolute, root-relative, path-relative and
            // protocol-relative locations against the current URL.
            next = (new URL(l, url.href));
          } catch (e) {
            // Unusable Location header: deliver this response as the result.
            console.error(e, l);
          }
          if (next) {
            ed = 1; // the follow-up attempt owns the request from here on
            reply.destroy();
            ++info.redirect;
            url = next;
            setTimeout(to, 0);
            return;
          }
        }
        info.code = [reply.statusCode, reply.statusMessage];
        info.head = { ...(reply.headers) };
        if (size === 0) {
          ed = 1;
          reply.destroy();
          finish();
          return;
        }
        reply.on('error', fail).on('data', (chunk) => {
          if (ed) return;
          info.body = Buffer.concat([info.body, chunk]);
          if (size > 0 && info.body.length >= size) {
            ed = 1;
            reply.destroy();
            info.body = info.body.slice(0, size);
            finish();
          }
        }).once('end', () => {
          reply.destroy();
          if (ed) return;
          ed = 1;
          finish();
        });
      }).once('error', fail);
      if (body.length > 0) req.write(body);
      req.end();
    };
    to();
  };
  // Pending calls, stored as argument arrays, served first-in first-out.
  let list = (new Set());
  {
    let tmp = () => {
      let args = null;
      do {
        if (list.size <= 0) return;
        args = list.shift();
      } while (!args);
      handle(...args);
    };
    setInterval(tmp, interval);
    setTimeout(tmp, 0);
  }
  return (...args) => void list.add([...args]);
})();
// Pool of page URLs waiting to be crawled; filled here and by main().
const generatePagePool = (new Set());
/**
 * Keeps `generatePagePool` stocked with random page URLs and hands them out.
 *
 * Once per `interval` milliseconds the next entry of GENERATE_RANDOM is
 * requested and the URL from its `Location` header is added to the pool
 * (Commons URLs go straight to `saveList` instead). Nothing is requested
 * while the pool holds more than 1000 entries.
 *
 * The returned function `generatePage(callback)` takes the oldest URL from the
 * pool that is not yet in `ED`, records it in `ED` and passes it to
 * `callback`; it polls every 100 ms until such a URL is available.
 */
const generatePage = (() => {
  let pool = generatePagePool;
  let interval = 1000;
  let count = 1;
  let random = () => {
    if (pool.size > 1000) return;
    let source = GENERATE_RANDOM[0];
    // Rotate the source list so the next tick uses the next entry.
    GENERATE_RANDOM.push(GENERATE_RANDOM.shift());
    https.get(encodeURI(source), (reply) => {
      reply.destroy();
      let location = reply.headers['location'];
      if (!location) {
        // Not a redirect (error page, rate limit, ...): nothing to add.
        console.error('RANDOM-GEN:', `no location header (${reply.statusCode}) from ${source}`);
        return;
      }
      let url = null;
      try {
        url = (new URL(location, source));
      } catch (err) {
        console.error('RANDOM-GEN:', err);
        return;
      }
      if (url.host === 'commons.wikimedia.org') {
        saveList.add(url.href);
        return;
      }
      pool.add(url.href);
    }).on('error', (err) => {
      console.error('RANDOM-GEN:', err);
    });
  };
  for (let i = 0; i < count; ++i) {
    setInterval(random, interval);
    setTimeout(random, 0);
  }
  return (callback) => {
    let tmp = () => {
      let value = pool.shift();
      if (value && (!ED.has(value))) {
        ED.add(value);
        if (callback) callback(value);
      } else {
        setTimeout(tmp, 100);
      }
    };
    tmp();
  };
})();
/**
 * Fetches the raw wikitext of a page (`?action=raw`).
 *
 * @param {?string} title - page title (appended to TITLE_PREFIX) or an
 *   absolute http(s) URL; `null` (the default) takes a URL from
 *   generatePage(). Other falsy values do nothing.
 * @param {function} [callback] - called as `callback(urls, body)` where
 *   `urls` is the `info.url` value reported by request() and `body` is the
 *   response body, or `false` when the request failed or the response is not
 *   `text/x-wiki`.
 */
const getPage = (title = null, callback) => {
  let handler = (url, callback) => {
    url = (new URL(url));
    url.search = '?action=raw';
    request({
      method: 'GET',
      url,
      head: {},
      body: '',
      size: -1,
      redirect: false,
      callback: (info) => {
        if (!info.head) {
          // request() reports failures with `head === null`; there is no
          // content to hand on.
          info.body = false;
        } else {
          let type = info.head['content-type'];
          // Compare only the media type, ignoring parameters, spacing and case.
          if (type && type.split(';')[0].trim().toLowerCase() !== 'text/x-wiki') {
            info.body = false;
          }
        }
        if (callback) callback(info.url, info.body);
      }
    });
  };
  if (title) {
    let url = (/^https?:\/\//i.test(title) ? title : TITLE_PREFIX + title);
    handler(url, callback);
  } else {
    if (title === null) generatePage((url) => void handler(url, callback));
  }
};
/**
 * Extracts the external http(s) URLs found in a piece of wikitext.
 *
 * A URL candidate starts at `http://` or `https://` (any letter case) and
 * runs up to the next space, CR, LF or one of `| [ ] { } < >`. It is then cut
 * at the first character above U+00FF, stripped of its fragment and parsed.
 * A trailing dot on the host name is removed, and URLs pointing at a known
 * web-archive host are dropped.
 *
 * Candidates are found with a global scan, so a URL at the very start of the
 * text is included and the scan does not stop early.
 *
 * @param {*} value - wikitext (converted with String())
 * @returns {string[]} unique normalised URLs in order of first appearance
 */
const getPageURL = (value) => {
  const archiveHosts = new Set([
    'web.archive.org',
    'www.archive.org',
    'archive.org',
    'archive.today',
    'archive.is',
    'archive.fo',
    'archive.md',
    'archive.li',
    'archive.ph',
    'archive.vn'
  ]);
  const text = String(value);
  const found = new Set();
  const candidates = text.matchAll(/https?:\/\/[^ |\]}<\[{>\r\n]*/gi);
  for (const [raw] of candidates) {
    // End the URL at the first character outside U+0000-U+00FF.
    const cut = raw.search(/[^\u0000-\u00ff]/);
    const plain = (cut === -1 ? raw : raw.slice(0, cut)).split('#')[0];
    let parsed = null;
    try {
      parsed = new URL(plain);
    } catch (e) {
      // Not a parsable URL (for example a bare "http://"); skip it.
      continue;
    }
    if (parsed.hostname.endsWith('.')) parsed.hostname = parsed.hostname.slice(0, -1);
    if (archiveHosts.has(parsed.hostname)) continue;
    found.add(parsed.href);
  }
  return [...found];
};
/**
 * Extracts the targets of internal `[[...]]` links from wikitext.
 *
 * For each link the part before the first `|` (or `{{!}}`) is taken, trimmed,
 * has its spaces replaced by `_` and its `#section` removed. Links are
 * skipped when the target is empty, starts with `#` or `:`, contains a brace
 * or bracket, or has a namespace prefix from the exclusion list below.
 *
 * Links are found with a global scan, so a link at the very start of the
 * text is included and an empty `[[]]` does not end the scan.
 *
 * @param {*} value - wikitext (converted with String())
 * @returns {string[]} unique link targets in order of first appearance
 */
const getPageLink = (value) => {
  // Namespace prefixes whose pages are not followed.
  const excluded = new Set([
    'Special', '特殊',
    'Media', '媒体', '媒体文件', '媒體',
    'Talk', '对话', '對話', '討論', '讨论',
    'MediaWiki', 'Mediawiki_Talk', 'MediaWiki討論', 'MediaWiki讨论',
    'Wikipedia', 'Wikipedia_Talk', 'WP', 'WT', 'Project', '維基百科', '维基百科', '維基百科對話', '維基百科討論', '维基百科对话', '维基百科讨论',
    'User', 'User_Talk', 'U', 'UT', '使用者', '用戶', '用户',
    'Help', 'Help_Talk', 'H', '使用說明', '帮助', '幫助', '使用說明討論', '帮助对话', '帮助讨论', '幫助對話', '幫助討論',
    'File_Talk', 'FT', 'Image_Talk', '图像对话', '图像讨论', '圖像對話', '圖像討論', '文件对话', '文件對話', '文件討論', '文件讨论', '档案对话', '档案讨论', '檔案對話', '檔案討論',
    'Template', 'T', '样板', '模板', '樣板', '样板对话', '样板讨论', '模板对话', '模板對話', '模板討論', '模板讨论', '樣板對話', '樣板討論',
    'Category', 'CAT', '分类', '分類', '分类对话', '分类讨论', '分類對話', '分類討論',
    '主題討論', '主题讨论',
    'Draft', 'Draft_Talk', '草稿', '草稿討論', '草稿讨论',
    'Module', '模块', '模組', '模組討論',
    'Topic', '話題', '话题'
  ].map((name) => name.toLowerCase().split(' ').join('_').split(':').join('')));
  const text = String(value);
  const links = new Set();
  // Each match runs from a "[[" to the first "]]" after it.
  for (const [, inner] of text.matchAll(/\[\[([\s\S]*?)\]\]/g)) {
    const target = inner.split('{{!}}').join('|').split('|')[0].trim();
    if (target === '') continue;
    if (target[0] === '#' || target[0] === ':') continue;
    if (/[{}\[\]]/.test(target)) continue;
    const title = target.split(' ').join('_').split('#')[0];
    const colon = title.indexOf(':');
    if (colon !== -1 && excluded.has(title.slice(0, colon).toLowerCase())) continue;
    links.add(title);
  }
  return [...links];
};
/**
 * Running statistics of the save workers.
 *   count   - number of save attempts started (`doing`) and finished (`done`)
 *   code    - per result code: [number of occurrences, timestamp of the last one]
 *   time    - durations in milliseconds of timed save attempts
 *   display - prints a status report to the console
 */
const saveStatus = {
  count: {
    doing: 0,
    done: 0
  },
  code: {},
  time: [],
  display() {
    const rule = '-'.repeat(30);
    // Only the most recent durations are listed; the average uses all of them.
    const shown = 28;
    const counters = Object.assign({}, this.count);
    const codes = {};
    for (const [key, entry] of Object.entries(this.code)) {
      codes[key] = `${(new Date(entry[1])).toISOString()}|${entry[0]}`;
    }
    const recent = this.time.slice(-shown).map((ms) => ms / 1000);
    console.log(rule);
    console.log('NOW:', (new Date()).toISOString());
    console.log('UPTIME:', String(date(process.uptime())));
    console.log('COUNT:', counters);
    console.log('CODE:', codes);
    console.log('TIME:', recent, 'AVERAGE:', this.time.avg() / 1000);
    console.log('SAVE-SIZE:', saveList.size);
    console.log('GEN-SIZE:', generatePagePool.size);
    console.log('ED-SIZE:', ED.size);
    console.log(rule);
  }
};
/**
 * Archive back-ends. Each method submits `url` to one archive service through
 * request() and reports the outcome with `callback(url, info)`. A request
 * that ends with code -3 (retries exhausted) is submitted again.
 */
const save = {
  /**
   * Submits `url` to the Wayback Machine save endpoint (headers only).
   */
  archive_org: (url, callback) => {
    let to = () => {
      let surl = 'https://web.archive.org/save';
      request({
        method: 'POST',
        url: surl,
        head: {
          'content-type': 'application/x-www-form-urlencoded',
          'referer': surl
        },
        body: querystring.stringify({ url, capture_all: 'on' }),
        size: 0,
        redirect: false,
        callback: (info) => {
          if (info.code[0] === -3) {
            setTimeout(to, 0);
            return;
          }
          if (callback) callback(url, info);
        }
      });
    };
    to();
  },
  /**
   * Submits `url` to archive.today: loads the front page to read the
   * `submitid` form value, posts the form, then requests the URL named in the
   * `refresh` or `location` response header. When no submit id is found, or
   * the submit response carries neither header, `url` is put back on
   * `saveList` and `callback` is not called.
   */
  archive_today: (url, callback) => {
    let to = () => {
      request({
        method: 'GET',
        url: 'https://archive.md/',
        head: {},
        body: '',
        size: -1,
        redirect: false,
        callback: (iinfo) => {
          if (iinfo.code[0] === -3) {
            setTimeout(to, 0);
            return;
          }
          // Cut the value of the "submitid" form field out of the page.
          let rv = iinfo.body;
          let v = String(rv);
          v = v.slice(v.indexOfL('form'));
          v = v.slice(v.indexOfL('submitid'));
          v = v.slice(v.indexOfL('value') + 5);
          v = v.slice(0, v.indexOf('/>'));
          let id = v.replaceAll('=', '').replaceAll('"', '').replaceAll('\'', '');
          console.log('TODAY-SUBID:', id);//...
          if (!id) {
            saveList.add(url);
            return;
          }
          let to2 = () => {
            request({
              method: 'POST',
              url: 'https://archive.is/submit/',
              head: {
                'content-type': 'application/x-www-form-urlencoded'
              },
              body: querystring.stringify({ submitid: id, url }),
              size: -1,
              redirect: false,
              callback: (info) => {
                if (info.code[0] === -3) {
                  setTimeout(to2, 0);
                  return;
                }
                if ((!info.head['refresh']) && (!info.head['location'])) {
                  // No `reply` exists in this scope: request() has already
                  // consumed the response, so there is nothing to destroy here.
                  info.code = [-429, 'Captcha[archive.today]'];
                  delete info.body;
                  try {
                    console.log(info, decodeURI(url));
                  } catch (e) {
                    // decodeURI throws on malformed escapes; log the raw URL.
                    console.log(info, url);
                  }
                  saveList.add(url);
                  return;
                } else {
                  let tmp = null;
                  if (info.head['refresh']) {
                    tmp = info.head['refresh'];
                    tmp = tmp.slice(tmp.indexOfL('url=') + 4).split(';')[0];
                  } else {
                    tmp = info.head['location'];
                  }
                  // Request the result URL; its outcome is ignored.
                  request({
                    method: 'GET',
                    url: tmp,
                    head: {},
                    body: '',
                    size: -1,
                    redirect: true,
                    callback: () => { }
                  });
                  ED.add(url);
                  ED.add(tmp);
                }
                if (callback) callback(url, info);
              }
            });
          };
          to2();
        }
      });
    };
    to();
  }
};
// Queue of URLs waiting to be archived, seeded with a fixed list of pages.
// Entries are consumed in insertion order.
const saveList = (() => {
  const wiki = (title) => TITLE_PREFIX + title;
  return new Set([
    wiki('Wikipedia:首页'),
    wiki('Portal:特色內容'),
    wiki('Template:Dyk'),
    wiki('Wikipedia:新条目推荐/候选'),
    wiki('Portal:新聞動態'),
    wiki('Category:新闻动态'),
    wiki('Wikipedia:典范条目'),
    wiki('Wikipedia:典范条目/存档'),
    wiki('Category:典范条目'),
    wiki('Wikipedia:典范条目评选'),
    wiki('Wikipedia:已撤销的典范条目'),
    wiki('Wikipedia:优良条目'),
    wiki('Wikipedia:优良条目/存档'),
    wiki('Category:優良條目'),
    wiki('Wikipedia:優良條目評選'),
    wiki('Wikipedia:已撤消的優良條目'),
    wiki('Wikipedia:特色图片'),
    wiki('Category:特色图片'),
    wiki('Wikipedia:特色圖片評選'),
    wiki('Wikipedia:已撤销的特色图片'),
    wiki('Wikipedia:每日图片'),
    'https://commons.wikimedia.org/wiki/Commons:Picture_of_the_day',
    wiki('Wikipedia:特色列表'),
    wiki('Category:特色列表'),
    wiki('Wikipedia:特色列表评选'),
    wiki('Wikipedia:已撤销的特色列表'),
    wiki('Template:AdvancedSiteNotices'),
    wiki('Template:Bulletin'),
    wiki('Wikipedia:公告欄/存檔'),
    wiki('Wikipedia:互助客栈'),
    wiki('Wikipedia:互助客栈_(全部)'),
    wiki('Wikipedia:互助客栈/消息'),
    wiki('Wikipedia:互助客栈/方针'),
    wiki('Wikipedia:互助客栈/技术'),
    wiki('Wikipedia:互助客栈/求助'),
    wiki('Wikipedia:互助客栈/条目探讨'),
    wiki('Wikipedia:互助客栈/其他'),
    wiki('Wikipedia:知识问答'),
    wiki('Wikipedia:Guestbook_for_non-Chinese_speakers'),
    wiki('Wikipedia_talk:Guestbook_for_non-Chinese_speakers'),
  ]);
})();
// Save scheduler: repeatedly takes one URL from saveList and submits it to the
// archive back-ends, limited by a budget that is refilled every `range` ms.
{
// Base delay between two toSave() runs, in milliseconds (jitter is added in er()).
let default_interval = 5 * 1000;
// How long to stay idle after a result code 429 was last recorded, in milliseconds.
let wait429 = 10 * 1000;
// Budget per `range` window.
let default_left = 15;
let left = default_left;
let range = 60 * 1000;
setInterval(() => { left = default_left }, range);
// Processes at most one queued URL per call.
let toSave = () => {
if (saveList.size <= 0) return;
// saveStatus.code[429] is [count, timestamp of the last occurrence].
let after429 = saveStatus.code[429];
after429 = (after429 ? after429[1] : null);
if (after429 && (Date.now() - after429) < wait429) return;
if (left <= 0 || (!left)) return;
let url = saveList.shift();
// URLs already in ED are dropped, except wiki page URLs, which are processed again.
if (ED.has(url) && (!url.startsWith(TITLE_PREFIX))) return;
// Completion handler passed to the save.* back-ends.
let z = (furl, info) => {
let t2 = Date.now();
// t1 is only positive for the archive_org attempt; see the 403 branch below.
if (t1 > 0) saveStatus.time.push(t2 - t1);
saveStatus.count.done += 1;
left -= info.redirect;
let l = info.code[0];
let arr = saveStatus.code[l];
if (arr) {
arr[0] += 1;
arr[1] = t2;
} else {
saveStatus.code[l] = [1, t2];
}
delete info.body;
try {
console.log(info, decodeURI(furl));
} catch (e) {
console.log(info, furl);
}
saveStatus.display();
switch (info.code[0]) {
case 200: {
ED.add(furl);
ED.add(url);
break;
}
case 403: {
// Fall back to archive.today; t1 = -1 disables timing for that attempt.
saveStatus.count.doing += 1;
t1 = -1;
save.archive_today(url, z);
break;
}
default: {
// Any other result: queue the URL again.
saveList.add(url);
break;
}
}
};
// Start time of the archive_org attempt; assigned in the probe callback below.
let t1 = null;
// Probe the URL first (headers only, following redirects).
request({
method: 'GET',
url,
head: {},
body: '',
size: 0,
redirect: true,
callback: (info) => {
console.log(info);
// t is the first digit of the status code; failed probes (code <= 0) and
// 4xx/5xx responses drop the URL without queueing it again.
let t = Number(String(info.code[0])[0]);
if (info.code[0] <= 0 || t >= 4 || (!t)) return;
// NOTE(review): `!left` is evaluated after the decrement, so the URL is also
// requeued when this call takes the budget from 1 to 0 — confirm that leaving
// the last unit unused is intended.
if ((left--) <= 0 || (!left)) {
saveList.add(url);
return;
}
//saveStatus.count.doing += Object.keys(save).length;
saveStatus.count.doing += 1;
t1 = Date.now();
save.archive_org(url, z);
}
});
};
// Self-rescheduling loop: runs toSave(), then waits default_interval plus a
// random offset in the range -1000..999 ms.
let er = () => {
let interval = (() => {
let z = default_interval;
/*
if (saveStatus.time.length > 10) {
let t = saveStatus.time.avg()
//z = t - (t / 4)
z = t / 2
}
*/
z = Math.abs(z);
return z + Math.floor((Math.floor(Math.random() * 10) % 2 ? Math.random() : -Math.random()) * 1000);
})();
console.log({ INTERVAL: interval / 1000, LEFT: left });
toSave();
setTimeout(er, interval);
};
er();
}
/**
 * Starts the crawler.
 *
 * Prints the status report once a minute and starts `count` timers that, every
 * `sec` seconds, fetch a page, queue it for archiving, add its internal links
 * to `generatePagePool` and its external URLs to `saveList`. Crawling pauses
 * while `saveList` holds more than 100 entries.
 *
 * @param {number} [sec=INTERVAL_SECOND] - seconds between crawl ticks
 * @param {number} [count=COUNT] - number of crawl timers to start
 */
const main = (sec = INTERVAL_SECOND, count = COUNT) => {
  const periodMs = sec * 1000;
  const statusEveryMs = 60 * 1000;
  const showStatus = () => void saveStatus.display();
  setInterval(showStatus, statusEveryMs);
  setTimeout(showStatus, 0);

  // Logs a URL in decoded form when possible, raw otherwise.
  const logUrl = (href) => {
    try {
      console.log(decodeURI(href));
    } catch (e) {
      console.log(href);
    }
  };

  // Handles one fetched page.
  const onPage = (furl, data) => {
    const pageUrl = (new URL(furl));
    pageUrl.search = '';
    furl = pageUrl.href;
    logUrl(furl);
    saveList.add(furl);
    if (!data) return;
    for (const title of getPageLink(data)) {
      console.log('|-', `[[${title}]]`);
      generatePagePool.add(TITLE_PREFIX + title);
    }
    console.log('|');
    const external = getPageURL(data);
    if (external.length > 0) {
      for (const href of external) {
        console.log('|-', href);
        saveList.add(href);
      }
    } else {
      console.log('<NOT FOUND URL>');
    }
    saveStatus.display();
  };

  // Non-zero while a delayed re-check of the backlog is already scheduled.
  let waiting = 0;
  const crawl = () => {
    if (saveList.size > 100) {
      if (!waiting) {
        waiting = 1;
        setTimeout(() => {
          waiting = 0;
          crawl();
        }, 1000);
      }
      return;
    }
    getPage(null, onPage);
  };

  for (let i = 0; i < count; ++i) {
    setInterval(crawl, periodMs);
    setTimeout(crawl, 0);
  }
};
/* the content of file "package.json":
{
"engines": {
"node": ">=13.0.0"
},
"scripts": {
"start": "node archiver.js"
}
}
*/
/**
 * Starts a small HTTP status server and, optionally, a keep-alive ping.
 *
 * `GET /` answers with a JSON dump of the configuration constants, the save
 * statistics and the current queues; any other path gets an empty response.
 *
 * @param {?number} port - port to listen on; falls back to process.env.PORT
 * @param {string} [site] - when given, a random path on this site's origin is
 *   requested once a minute
 */
const hosting = (port, site) => {
  http.createServer((req, res) => {
    if (req.url !== '/') {
      res.end();
      return;
    }
    res.writeHead(200, { 'content-type': 'text/plain; charset=utf-8' });
    res.end(JSON.stringify({
      NOW: (new Date()).toISOString(),
      UPTIME: String(date(process.uptime())),
      USER_AGENT,
      SITE_PREFIX,
      TITLE_PREFIX,
      PAGE_RANDOM,
      CAT_RANDOM,
      GENERATE_RANDOM,
      INTERVAL_SECOND,
      COUNT,
      saveStatus,
      saveList: { size: saveList.size, data: [...saveList] },
      generatePagePool: { size: generatePagePool.size, data: [...generatePagePool] },
      ED: { size: ED.size, data: [...ED] }
    }, null, 2));
  }).listen(port || process.env.PORT);
  if (site) {
    site = (new URL(site)).origin;
    setInterval(() => {
      https.get(`${site}/${Math.random()}`, (reply) => void reply.destroy())
        // A failed ping is only logged; without a listener the 'error' event
        // would surface as an uncaught exception.
        .on('error', (err) => void console.error('KEEP-ALIVE:', err));
    }, 60 * 1000);
  }
};
// Convenience wrappers around hosting(): listen on process.env.PORT and ping
// the app's public URL on the named platform.
const heroku = (name) => {
  const site = 'https://'.concat(name, '.herokuapp.com');
  return hosting(null, site);
};
const glitch = (name) => {
  const site = 'https://'.concat(name, '.glitch.me');
  return hosting(null, site);
};
// Start the crawler with the default interval and timer count.
main();
// Optional: uncomment one of the calls below (with the app name filled in) to
// also start the status server and keep-alive ping via hosting().
//heroku('[INPUT YOUR HEROKU APP NAME]');
//glitch('[INPUT YOUR GLITCH APP NAME]');