Parsoid/Language conversion/Preprocessor fixups/munge
Appearance
#!/usr/bin/node
/*
   This is the script that post-processes the results of `dumpgrepper`
   into wikitext for posting on mediawiki.org.

   It helps if you use the `git-mediawiki` package
   (https://github.com/Git-Mediawiki/Git-Mediawiki/wiki)
   to sync the results back onto mediawiki.org.

   Usage:
   1. Put the output of `dumpgrepper` into a subdirectory named
      `results-$DUMPDATE/`.
   2. Check out the pages from mediawiki.org using:
        git clone -c remote.origin.categories='Parsoid' -c remote.origin.mwLogin=[your mediawiki username] -c remote.origin.mwPassword=[your mediawiki password] mediawiki::https://www.mediawiki.org/w
   3. Run:
        ./munge.js $DUMPDATE
   4. Change to `w/`:
        git add . && git commit && git push
*/
var fs = require('fs');
var path = require('path');
var https = require('https');

//var wikis = "cebwiki dewiki enwiki eswiki frwiki itwiki jawiki mediawikiwiki nlwiki plwiki ptwiki ruwiki svwiki viwiki warwiki zhwiki".split(/\s+/g);

// Write output under `w/` using wiki page names (for git-mediawiki) instead
// of under `out/`.
const GIT_WP = true;
// Accumulate all sites into shared per-category pages plus one counts table,
// instead of writing one page per site.
const UNIFIED_LIST = true;
// List each matching title once, without quoting the matching lines.
const TITLES_ONLY = true;
const PAGE_PREFIX = "Parsoid%2FLanguage_conversion%2FPreprocessor_fixups%2F";
const DUMPDATE = process.argv[2] || process.env.DUMPDATE || '20170620';

// Accumulated wikitext for the unified pages, keyed by project group
// ('wp' for sites whose code is 'wiki', 'sister' for everything else).
const unifiedOutput = {
  wp: { counts: '', chem: '', urls: '', nonarticle: '', other: '' },
  sister: { counts: '', chem: '', urls: '', nonarticle: '', other: '' }
};

if (GIT_WP) {
  // Copy this script itself to the wiki
  const selfFile = path.join(__dirname, "w/" + PAGE_PREFIX + "munge.mw");
  // Escape `&` and `<` so that markup-like text inside the script (for
  // example the literal "</nowiki>" strings below) cannot terminate the
  // <nowiki> wrapper early.
  const escaped = fs.readFileSync(__filename, "utf8")
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;');
  const selfPage = "<pre><nowiki>\n" + escaped + "\n</nowiki></pre>\n" +
    "[[Category:Parsoid]]\n";
  fs.writeFileSync(selfFile, selfPage, "utf8");
}

/**
 * Fetch `url` over HTTPS and parse the response body as JSON.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<*>} Resolves with the parsed body. Rejects on a
 *   non-200 status, a content type that does not start with
 *   `application/json`, a JSON parse failure, or a request error.
 */
const jsonRequest = function(url) {
  return new Promise(function(resolve, reject) {
    https.get(url, function(res) {
      const statusCode = res.statusCode;
      const contentType = res.headers['content-type'];
      let error;
      if (statusCode !== 200) {
        error = new Error("Request Failed " + statusCode + ": " + url);
      } else if (!/^application\/json/.test(contentType)) {
        error = new Error("Invalid content type: " + contentType);
      }
      if (error) {
        reject(error);
        // Consume the response so the socket is released.
        res.resume();
        return;
      }
      res.setEncoding('utf8');
      let rawData = '';
      res.on('data', function(d) { rawData += d; });
      res.on('end', function() {
        let parsedData;
        try {
          parsedData = JSON.parse(rawData);
        } catch (e) {
          reject(e);
          return;
        }
        resolve(parsedData);
      });
    }).on('error', function(e) {
      reject(e);
    });
  });
};

// Fetch list of wikis from siteinfo
const siteMatrixP = jsonRequest('https://www.mediawiki.org/w/api.php?action=sitematrix&format=json');

// Get the interwiki map from mediawiki so we know how to link titles.
const interWikiP = jsonRequest('https://www.mediawiki.org/w/api.php?action=query&format=json&meta=siteinfo&siprop=interwikimap%7Clanguagevariants');

// Indirect prefixes ("portable" prefixes), keyed by site code.
const INDIRECT_PREFIXES = {
  wiki: 'w',
  wiktionary: 'wikt',
  wikibooks: 'b',
  wikinews: 'n',
  wikiquote: 'q',
  wikisource: 's',
  wikiversity: 'v',
  wikivoyage: 'voy'
};

// Lazily built map from site base URL to its shortest interwiki prefix.
let reverseMap;

/**
 * Compute the interwiki link prefix used to link to pages on `site`.
 *
 * @param {Array<{prefix: string, url: string}>} interWikiMap - Interwiki
 *   entries; only those whose url ends in `/wiki/$1` are used. The map is
 *   indexed on the first call and the index is reused afterwards.
 * @param {{code: string, lang: (string|undefined), url: string}} site
 * @returns {?string} A prefix ending in ':' or null if none is known.
 */
const prefixForSite = function(interWikiMap, site) {
  if (!reverseMap) {
    reverseMap = new Map();
    interWikiMap.forEach(function(iw) {
      const m = /^(.+)\/wiki\/\$1$/.exec(iw.url);
      if (!m) { return; }
      const prev = reverseMap.get(m[1]);
      // Store shortest prefix
      if (prev && prev.length <= iw.prefix.length) { return; }
      reverseMap.set(m[1], iw.prefix);
    });
  }
  const p = INDIRECT_PREFIXES[site.code];
  if (p && site.lang) {
    return p + ':' + site.lang + ':';
  }
  // Direct prefixes on mw:
  const prefix = reverseMap.get(site.url);
  if (prefix) {
    return ':' + prefix + ':';
  }
  // Unknown :(
  return null;
};

/**
 * Read the dumpgrepper results for one site, classify each matching title
 * and either append to the unified output or write a per-site page.
 *
 * @param {{dbname: string, code: string, wikiprefix: string}} site
 */
const doOneSite = function(site) {
  const w = site.dbname;
  const inFile = path.join(__dirname, "results-" + DUMPDATE, w + "-results.txt");
  const outFile = GIT_WP ?
    path.join(__dirname, "w/" + PAGE_PREFIX + w + ".mw") :
    path.join(__dirname, "out/" + w + ".wt");
  //console.log("Reading", w);
  let title = null;
  let nonarticle = "";
  let chem = "";
  let urls = "";
  let math = "";
  let other = "";
  let countArticle = 0;
  let countNonarticle = 0;
  // True once the current title has been counted; starts true so that
  // nothing is counted before the first match header.
  let counted = true;
  let raw = '';
  let missing = false;
  try {
    raw = fs.readFileSync(inFile, "utf8");
  } catch (e) {
    // Best effort: a site without results is reported as "(missing)".
    missing = true;
    console.warn("Skipping missing results:", w);
  }
  // The first line of the results file is dropped before scanning.
  const lines = raw.replace(/\n+$/, '').split(/\r\n?|\n/g).slice(1);
  lines.forEach(function(line) {
    const header = /^== Match: \[\[(.*)\]\] ==$/.exec(line);
    if (header) {
      title = header[1];
      counted = false;
      return;
    }
    // A result line before any match header has no title to attribute to.
    if (title === null) { return; }
    let item = "# [[" + site.wikiprefix + title + "]]\n";
    if (TITLES_ONLY) {
      if (counted) { return; }
    } else {
      // Quote the line, neutralising embedded <nowiki> tags and
      // highlighting every `-{` in red.
      item += "#:<code><nowiki>" +
        line.replace(/<(\/?nowiki)/g, '&lt;$1')
          .split('-{')
          .join('</nowiki><b style="color:red">-<nowiki/>{</b><nowiki>') +
        "</nowiki></code>\n";
    }
    // Removed matched -{ ... }- markup.
    const stripped = line.replace(/-\{[^{}]*\}-/g, '');
    if (!/-\{/.test(stripped)) { /* no unmatched markup */ return; }
    // Titles of the form "Prefix:rest" are treated as non-article pages.
    if (/^[^:]+:./.test(title)) {
      nonarticle += item;
      if (!counted) {
        countNonarticle += 1;
        counted = true;
      }
      return;
    }
    // only count each title once
    if (!counted) {
      countArticle += 1;
      counted = true;
    }
    if (/IUPAC|OtherNames|Andere Namen/.test(stripped)) {
      chem += item;
    } else if (/\[http[^\]\s]*-\{/.test(stripped)) {
      urls += item;
    } else if (/<math/.test(stripped)) {
      math += item;
    } else {
      other += item;
    }
  });

  // Write output file.
  if (UNIFIED_LIST) {
    const key = (site.code === 'wiki') ? 'wp' : 'sister';
    let links = '';
    const sections = [
      ['chem', chem],
      ['urls', urls],
      ['other', math + other],
      ['nonarticle', nonarticle]
    ];
    sections.forEach(function(section) {
      const fld = section[0];
      const content = section[1];
      if (!content) { return; }
      unifiedOutput[key][fld] += "==" + w + "==\n" + content;
      // Link to this site's section on the subpage, labelled with the
      // first letter of the category name.
      links += '[[/' + key + '-' + fld + '#' + w + '|' + fld[0] + ']] ';
    });
    const shownArticle = missing ? "(missing)" : countArticle;
    const shownNonarticle = missing ? "(missing)" : countNonarticle;
    unifiedOutput[key].counts += '|-\n| ' + w + ' || ' + shownArticle +
      ' || ' + shownNonarticle + ' || ' + links + '||\n';
    return;
  }

  let out = "==" + w + "==\n";
  out += countArticle + " articles, " + countNonarticle + " other pages.\n";
  if (chem) {
    out += "=== Chemical names ===\n" + chem;
  }
  if (urls) {
    out += "=== Urls ===\n" + urls;
  }
  if (math) {
    out += "=== Math markup ===\n" + math;
  }
  if (other) {
    out += "=== Other ===\n" + other;
  }
  if (nonarticle) {
    out += "=== Matches not in article namespace ===\n" + nonarticle;
  }
  if (GIT_WP) {
    out += '[[Category:Parsoid]]\n';
  }
  if (!missing) {
    fs.writeFileSync(outFile, out, "utf8");
  }
};

Promise.all([interWikiP, siteMatrixP]).then(function(arr) {
  const interWikiMap = arr[0].query.interwikimap;
  const languageVariants = arr[0].query.languagevariants;
  const siteMatrix = arr[1].sitematrix;
  const sites = [];

  // Queue `site` for processing unless it is closed, fishbowl or private,
  // or has no known interwiki prefix.
  const maybeAddOne = function(site) {
    if (site.closed !== undefined ||
        site.fishbowl !== undefined ||
        site.private !== undefined) {
      return;
    }
    const prefix = prefixForSite(interWikiMap, site);
    if (!prefix) {
      console.warn("Skipping", site.url, "because interwiki prefix unknown.");
      return;
    }
    site.wikiprefix = prefix;
    sites.push(site);
  };

  // Language groups are stored under consecutive numeric keys; stop at the
  // first missing index.
  for (let i = 0; siteMatrix[i] !== undefined; i++) {
    const group = siteMatrix[i];
    const lang = group.code;
    if (languageVariants[lang] !== undefined) {
      console.warn('Skipping', group.localname, 'because LanguageConverter is in use.');
    } else {
      group.site.forEach(function(ss) {
        ss.lang = lang;
        maybeAddOne(ss);
      });
    }
  }
  siteMatrix.specials.forEach(maybeAddOne);

  sites.forEach(doOneSite);

  if (UNIFIED_LIST) {
    let counts = 'Article counts from the ' + DUMPDATE + ' dump.\n';
    ['wp', 'sister'].forEach(function(key) {
      if (key === 'wp') {
        counts += '== Wikipedia ==\n';
      } else {
        counts += '== Sister projects ==\n';
      }
      counts += '{| class="wikitable sortable" style="width:100%"\n' +
        '|-\n' +
        '! Wikiproject !! # of titles in main namespace !! # of titles in other namespaces !! Links !! Notes\n' +
        unifiedOutput[key].counts +
        '|}\n';
    });
    if (GIT_WP) {
      counts += '[[Category:Parsoid]]\n';
    }
    const basename = GIT_WP ? ('w/' + PAGE_PREFIX + DUMPDATE) : 'out/';
    const countFile = path.join(__dirname, basename + (GIT_WP ? '.mw' : 'counts.wt'));
    fs.writeFileSync(countFile, counts, 'utf8');
    ['wp', 'sister'].forEach(function(key) {
      ['chem', 'urls', 'other', 'nonarticle'].forEach(function(ty) {
        const suffix = GIT_WP ?
          ('%2F' + key + '-' + ty + '.mw') :
          (key + '-' + ty + '.wt');
        const listFile = path.join(__dirname, basename + suffix);
        let data = unifiedOutput[key][ty];
        if (GIT_WP) {
          data += '[[Category:Parsoid]]\n';
        }
        fs.writeFileSync(listFile, data, 'utf8');
      });
    });
  }
}).catch(function(e) {
  // Surface request and write failures instead of leaving an unhandled
  // rejection, and signal failure to the caller.
  console.error(e);
  process.exitCode = 1;
});