User:Amgine/Dump processing
Appearance
test_xml
[edit]A quick hack to build some word lists for User:Nemo_bis. Two quick PHP scripts: the first extracts a collection of files (more than 1,300 with the 20140311 dump), one per L2 header, each named after the header text and containing all words which had that L2 header (./[L2 Header].txt). The second maps these files to Wikipedia language codes (copies each list to ./wpdict/[wpCode].txt).
test_xml.php
[edit]<?php
/*
* test_xml.php
*
* Copyright 2014 Amgine <amgine@Amgines-MacBook-Air.local>
*
* This program is free software. It comes without any warranty, to the extent
* permitted by applicable law. You can redistribute it and/or modify it under
* the terms of the Do What The Fuck You Want To Public License, Version 2, as
* published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
*
*/
/**
* function: getL2
**
* Return an array of L2 headers from a wikitext string.
*
* The text is examined line by line; a line counts as a level 2 header when
* it starts with "==", followed by one or more word, whitespace, dash or
* underscore characters, followed by "==". Deeper headers ("===Noun===") do
* not match because "=" is not an accepted header character.
**
* @param string $text The wiki syntax string of the page.
* @return array An array of language names (or other level 2 header texts.)
*               Always an array; empty when no level 2 header is found.
**/
function getL2( $text ){
    $l2 = array();
    foreach( explode( "\n", (string) $text ) as $line ){
        $matches = array();
        // The pattern is anchored at the start of the line, so a single
        // preg_match() finds the only header a line can contain.
        if( preg_match( '/^==([\w\s\-‒-—‑¯_]+)==/u', $line, $matches ) ){
            $l2[] = trim( $matches[1] );
        }
    }
    // Return the (possibly empty) list rather than falling off the end of the
    // function, so callers always receive the documented array type.
    return $l2;
}
/**
* function: add2Dictionary
**
* Process found term (breaking up phrases), check for uniqueness, and
* add to the dictionary. Once the dictionary holds more than 500 entries it
* is appended to "<lang>.txt" in the current directory and emptied, so
* uniqueness is only guaranteed within one buffered chunk.
**
* @param string $term The found term to be added to the dictionary.
* @param array $dictionary The dictionary which will be appended (by reference).
*                          A null value is treated as an empty dictionary.
* @param string $lang Name of the dictionary; used as the output file name.
* @param bool $whitespace When true the trimmed term is stored whole; when
*                         false (default) it is split on spaces into words.
**/
function add2Dictionary( $term, &$dictionary, $lang, $whitespace = false ){
    // A not-yet-created array slot passed by reference arrives as null.
    if( !is_array( $dictionary ) ){
        $dictionary = array();
    }
    $term = (string) $term;
    if( $whitespace ){
        $words = array( $term );
    }elseif( preg_match( '/\w/u', $term ) ){
        // The u modifier makes \w recognise non-ASCII letters as well.
        $words = explode( ' ', $term );
    }else{
        $words = array();
    }
    foreach( $words as $word ){
        $word = trim( $word );
        // Skip the empty strings produced by consecutive spaces; compare
        // strictly so numeric-looking strings are not treated as equal.
        if( $word !== '' && !in_array( $word, $dictionary, true ) ){
            $dictionary[] = $word;
        }
    }
    // Buffer 500 entries in ram per language; a reasonable performance vs. memory tradeoff
    if( count( $dictionary ) > 500 ){
        // The trailing newline keeps the last word of this chunk from fusing
        // with the first word of the next appended chunk.
        if( file_put_contents( $lang . '.txt', implode( "\n", $dictionary ) . "\n", FILE_APPEND ) === false ){
            die( "Writing $lang dictionary -- FAILED!\n" );
        }
        $dictionary = array();
    }
}
// Main script: stream the dump page by page, sort every main-namespace title
// into per-language word lists keyed by L2 header, and write them to
// "<L2 header>.txt" in the current directory.
$reader = new XMLReader();
// @FIXME: hard-coded dump filename
if( !$reader->open( '/Volumes/VERBATIM HD/enwiktionary-20140311-pages-meta-current.xml' ) ){
    die( "Failed to open file for xml reading.\n" );
}
// @FIXME: evil, vile, but it works -- step into the document with two reads,
// then skip ahead to the first <page> element.
$reader->read(); $reader->read();
$reader->next( 'page' );
// Word buffers keyed by L2 header text. 'blacklist' collects excluded titles
// and is created up front because it is passed by reference below.
$dictionary = array( 'blacklist' => array() );
# Try Unicode magic to check for: MIXED_SCRIPT_CONFUSABLE, SINGLE_SCRIPT, INVISIBLE
# https://ssl.icu-project.org/apiref/icu4c/uspoof_8h.html#a0dbd60e53a571689baf65c63f4de8155
# @FIXME: Before this, remove words not in the main script of the language (requires name->code)
# @FIXME: Currently stripping all non-Latin!
$checker = new Spoofchecker();
// 50 = 2 + 16 + 32 -- presumably the bit values of the three checks named above.
$checker->setChecks ( 50 );
$i = 0;
do{
    // Progress counter, rewritten in place on one terminal line.
    ++$i;
    echo "\r$i";
    // Grab <page> node, make simple xml object of it.
    $node = $reader->readOuterXML();
    $element = simplexml_load_string( $node );
    if( $element === false ){
        // Empty or malformed node (e.g. no <page> was found); nothing to index.
        continue;
    }
    $title = (string) $element->title;
    $text = (string) $element->revision->text;
    // Only main-namespace pages whose title is not confusable are processed.
    if( (string) $element->ns !== '0' || $checker->isSuspicious( $title ) ){
        continue;
    }
    // @FIXME: Create a smarter exclusionary system
    // @TODO: Figure out what the consensus is on proper names
    // "(?:ity)?" makes the suffix optional so both "vulgar" and "vulgarity" match.
    if( preg_match( '/\{\{context\|[^}]*\b(vulgar(?:ity)?\b|obscen(?:e|ity)|offensive|pejorative|medicine|slang)[^}]*\}\}/iu', $text ) ){
        // Excluded pages are recorded whole (multi-word titles kept intact).
        add2Dictionary( $title, $dictionary['blacklist'], 'blacklist', true );
        continue;
    }
    $l2s = getL2( $text );
    if( is_array( $l2s ) ){
        foreach( $l2s as $lang ){
            if( !array_key_exists( $lang, $dictionary ) ){
                $dictionary[$lang] = array();
            }
            // NB: Optional $whitespace = true for function allows multi-word phrases,
            // default does not. '''Default may create broken CJK phrases'''
            add2Dictionary( $title, $dictionary[$lang], $lang );
        }
    }
}while( $reader->next( 'page' ) );
$reader->close();
// File is parsed, flush buffered dictionaries
echo "\n\n";
foreach( $dictionary as $lang => $dict ){
    // A buffer that was just flushed (or never filled) has nothing to write
    // and must not be reported as a failure.
    if( count( $dict ) === 0 ){
        continue;
    }
    echo "Writing $lang dictionary.";
    // Terminate the chunk with a newline so the file ends on a complete line.
    if( file_put_contents( $lang . '.txt', implode( "\n", $dict ) . "\n", FILE_APPEND ) !== false ){
        echo "\rWrote $lang dictionary successfully.\n";
    }else{
        echo " -- FAILED!\n";
    }
}
// @FIXME: move langname dictionaries to langcode dictionaries.
test_dict2wpMapper.php
[edit]<?php
/*
* test_dict2wpMapper.php
*
* Copyright 2014 Amgine <amgine@Amgines-MacBook-Air.local>
*
* This program is free software. It comes without any warranty, to the extent
* permitted by applicable law. You can redistribute it and/or modify it under
* the terms of the Do What The Fuck You Want To Public License, Version 2, as
* published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
*
*/
// Map of English Wiktionary L2 header text (language name) to Wikipedia
// language code(s). A value is either a single code string, or an array whose
// first element is a mode ('same' = copy the same list to every following
// code) followed by the codes.
$lang = array(
    // Germanics
    'English' => array( 'same', 'en', 'simple' ),
    'Dutch' => 'nl',
    'German' => 'de',
    'Swedish' => 'sv',
    'Norwegian Bokmål' => 'no',
    'Norwegian Nynorsk' => 'nn',
    'Danish' => 'da',
    'Luxembourgish' => 'lb',
    'Icelandic' => 'is',
    'Afrikaans' => 'af',
    'West Frisian' => 'fy',
    'Low German' => 'nds',
    'Scots' => 'sco',
    'Alemannic German' => 'als',
    'Yiddish' => 'yi',
    'Limburgish' => 'li',
    'Bavarian' => 'bar',
    'Faroese' => 'fo',
    'Dutch Low Saxon' => 'nds-nl',
    'West Flemish' => 'vls',
    'North Frisian' => 'frr',
    'Saterland Frisian' => 'stq',
    'Kölsch' => 'ksh',
    'Old English' => 'ang',
    'Pennsylvania German' => 'pdc',
    'Gothic' => 'got',
    // Italic
    'French' => 'fr',
    'Italian' => 'it',
    'Spanish' => 'es',
    'Portuguese' => 'pt',
    'Catalan' => 'ca',
    'Romanian' => 'ro',
    'Galician' => 'gl',
    'Latin' => 'la',
    'Occitan' => 'oc',
    'Piedmontese' => 'pms',
    'Haitian Creole' => 'ht',
    'Aragonese' => 'an',
    'Lombard' => 'lmo',
    'Sicilian' => 'scn',
    'Asturian' => 'ast',
    'Neapolitan' => 'nap',
    'Walloon' => 'wa',
    'Venetian' => 'vec',
    'Tarantino' => 'roa-tara',
    'Corsican' => 'co',
    'Romansch' => 'rm',
    'Ladino' => 'lad',
    'Friulian' => 'fur',
    'Ligurian' => 'lij',
    'Sardinian' => 'sc',
    'Franco-Provençal' => 'frp',
    'Extremaduran' => 'ext',
    'Picard' => 'pcd',
    // NB: Emilian => egl
    'Emilian' => 'eml',
    'Papiamentu' => 'pap',
    'Mirandese' => 'mwl',
    'Aromanian' => 'roa-rup',
    // Slavic
    'Russian' => 'ru',
    'Polish' => 'pl',
    'Ukrainian' => 'uk',
    'Czech' => 'cs',
    //'Serbo-Croatian' => array( 'same', 'sr', 'hr', 'sh', 'bs' ), avoid flames
    'Slovak' => 'sk',
    'Bulgarian' => 'bg',
    'Slovene' => 'sl',
    'Macedonian' => 'mk',
    'Belarusian' => array( 'same', 'be', 'be-x-old' ),
    'Upper Sorbian' => 'hsb',
    'Rusyn' => 'rue',
    'Kashubian' => 'csb',
    'Silesian' => 'szl',
    'Lower Sorbian' => 'dsb',
    'Old Church Slavonic' => 'cu',
    // Philippine
    'Waray-Waray' => 'war',
    'Cebuano' => 'ceb',
    'Tagalog' => 'tl',
    'Kapampangan' => 'pam',
    'Ilocano' => 'ilo',
    'Bikol Central' => 'bcl',
    'Pangasinan' => 'pag',
    // Japanic
    'Japanese' => 'ja',
    // Austroasiatic
    'Vietnamese' => 'vi',
    'Khmer' => 'km',
    // Turkic
    'Turkish' => 'tr',
    'Kazakh' => 'kk',
    'Uzbek' => 'uz',
    'Azeri' => 'az',
    'Tatar' => 'tt',
    'Bashkir' => 'ba',
    'Kyrgyz' => 'ky',
    'Chuvash' => 'cv',
    'Yakut' => 'sah',
    'Turkmen' => 'tk',
    'Uyghur' => 'ug',
    'Gagauz' => 'gag',
    'Karachay-Balkar' => 'krc',
    'Crimean Tatar' => 'crh',
    'Karakalpak' => 'kaa',
    'Tuvan' => 'tyv',
    // Sinitic
    'Cantonese' => 'zh-yue',
    'Min Nan' => 'zh-min-nan',
    'Gan' => 'gan',
    'Wu' => 'wuu',
    'Hakka' => 'hak',
    'Min Dong' => 'cdo',
    // NB: These may require additional parser logic, not processing now.
    // zh • 758,009 – Chinese (中文)
    // zh-classical • 3,245 – Classical Chinese (文言)
    // Sunda–Sulawesi
    'Malay' => 'ms',
    'Indonesian' => 'id',
    'Javanese' => 'jv',
    'Sundanese' => 'su',
    'Buginese' => 'bug',
    // Wikipedia code is hyphenated: map-bms
    'Banyumasan' => 'map-bms',
    'Minangkabau' => 'min',
    'Acehnese' => 'ace',
    'Banjarese' => 'bjn',
    'Chamorro' => 'ch',
    // Finno-Permic
    'Finnish' => 'fi',
    'Estonian' => 'et',
    'Northern Sami' => 'se',
    'Western Mari' => 'mrj',
    'Võro' => 'fiu-vro',
    'Komi-Zyrian' => 'kv',
    'Komi-Permyak' => 'koi',
    'Udmurt' => 'udm',
    // NB: Eastern Mari => chm
    'Eastern Mari' => 'mhr',
    'Veps' => 'vep',
    'Erzya' => 'myv',
    'Moksha' => 'mdf',
    // Semitic
    'Arabic' => 'ar',
    'Hebrew' => 'he',
    'Amharic' => 'am',
    'Egyptian Arabic' => 'arz',
    'Maltese' => 'mt',
    'Aramaic' => 'arc',
    'Tigrinya' => 'ti',
    // Iranian
    'Persian' => 'fa',
    'Tajik' => 'tg',
    'Kurdish' => 'ku',
    // Central Kurdish (Sorani) has its own code; mapping it to 'ku' as well
    // would overwrite the Kurdish list.
    'Central Kurdish' => 'ckb',
    'Mazanderani' => 'mzn',
    'Ossetian' => 'os',
    'Gilaki' => 'glk',
    'Pashto' => 'ps',
    'Zazaki' => 'diq',
    // Indo-Aryan
    'Hindi' => 'hi',
    'Marathi' => 'mr',
    'Western Panjabi' => 'pnb',
    'Bengali' => 'bn',
    'Bishnupriya Manipuri' => 'bpy',
    'Urdu' => 'ur',
    'Nepali' => 'ne',
    'Gujarati' => 'gu',
    'Fiji Hindi' => 'hif',
    'Sanskrit' => 'sa',
    'Sinhalese' => 'si',
    'Punjabi' => 'pa',
    'Oriya' => 'or',
    'Dhivehi' => 'dv',
    'Pali' => 'pi',
    'Bihari' => 'bh',
    'Assamese' => 'as',
    'Sindhi' => 'sd',
    'Kashmiri' => 'ks',
    // Constructed
    'Esperanto' => 'eo',
    'Volapük' => 'vo',
    'Ido' => 'io',
    'Interlingua' => 'ia',
    'Interlingue' => 'ie',
    'Novial' => 'nov',
    // Ugric
    'Hungarian' => 'hu',
    // Korean
    'Korean' => 'ko',
    // Baltic
    'Lithuanian' => 'lt',
    'Latvian' => 'lv',
    //NB: Samogitian => sgs
    'Samogitian' => 'bat-smg',
    'Latgalian' => 'ltg',
    // Basque
    'Basque' => 'eu',
    // Dravidian
    'Tamil' => 'ta',
    'Telugu' => 'te',
    'Malayalam' => 'ml',
    'Kannada' => 'kn',
    // Celtic
    'Breton' => 'br',
    'Welsh' => 'cy',
    'Irish' => 'ga',
    'Scottish Gaelic' => 'gd',
    'Manx' => 'gv',
    'Cornish' => 'kw',
    // Tibeto-Burman
    'Newari' => 'new',
    'Burmese' => 'my',
    'Tibetan' => 'bo',
    'Dzongkha' => 'dz',
    // Tai
    'Thai' => 'th',
    'Lao' => 'lo',
    'Zhuang' => 'za',
    // Hellenic
    'Greek' => 'el',
    'Pontic Greek' => 'pnt',
    // Kartvelian
    'Georgian' => 'ka',
    'Mingrelian' => 'xmf',
    // Albanian
    'Albanian' => 'sq',
    // Bornean
    'Malagasy' => 'mg',
    // Bantoid
    // sw • 26,073 – Swahili (Kiswahili)
    'Swahili' => 'sw',
    'Lingala' => 'ln',
    'Kinyarwanda' => 'rw',
    'Shona' => 'sn',
    'Kongo' => 'kg',
    'Northern Sotho' => 'nso',
    'Zulu' => 'zu',
    'Tswana' => 'tn',
    'Swazi' => 'ss',
    'Tsonga' => 'ts',
    'Kikuyu' => 'ki',
    'Venda' => 've',
    'Kirundi' => 'rn',
    'Luganda' => 'lg',
    'Tumbuka' => 'tum',
    'Sotho' => 'st',
    'Xhosa' => 'xh',
    'Chichewa' => 'ny',
    // Yoruboid
    'Yoruba' => 'yo',
    // Armenian
    'Armenian' => 'hy',
    // Quechuan
    'Quechua' => 'qu',
    // Polynesian
    'Maori' => 'mi',
    'Hawaiian' => 'haw',
    'Tongan' => 'to',
    'Tahitian' => 'ty',
    'Samoan' => 'sm',
    // Mongolic
    'Mongolian' => 'mn',
    'Kalmyk' => 'xal',
    // Uto-Aztecan
    'Nahuatl' => 'nah',
    // Northeast Caucasian
    'Chechen' => 'ce',
    'Lezgi' => 'lez',
    'Lak' => 'lbe',
    'Avar' => 'av',
    // Germanic Pidgins & Creoles
    'Tok Pisin' => 'tpi',
    'Sranan Tongo' => 'srn',
    'Bislama' => 'bi',
    'Pitcairn-Norfolk' => 'pih',
    // Cushitic
    'Somali' => 'so',
    'Oromo' => 'om',
    'Afar' => 'aa',
    // Athabaskan
    'Navajo' => 'nv',
    // Eskimo-Aleut
    'Greenlandic' => 'kl',
    'Inuktitut' => 'iu',
    'Inupiak' => 'ik',
    // Aymaran
    'Aymara' => 'ay',
    // Tupian
    'Guaraní' => 'gn',
    // Northwest Caucasian
    'Kabardian' => 'kbd',
    'Abkhaz' => 'ab',
    // Atlantic
    'Wolof' => 'wo',
    'Fula' => 'ff',
    // A priori Constructed
    'Lojban' => 'jbo',
    // Berber
    'Kabyle' => 'kab',
    // Micronesian
    'Nauruan' => 'na',
    'Marshallese' => 'mh',
    // Igboid
    'Igbo' => 'ig',
    // Timor-Babar
    'Tetum' => 'tet',
    // Algonquian
    'Cheyenne' => 'chy',
    'Cree' => 'cr',
    // Kwa
    'Ewe' => 'ee',
    'Akan' => 'ak',
    'Twi' => 'tw',
    // Iroquoian
    'Cherokee' => 'chr',
    // Mande
    'Bambara' => 'bm',
    // Chadic
    'Hausa' => 'ha',
    // Ubangian
    'Sango' => 'sg',
    // East Fijian
    'Fijian' => 'fj',
);
// Copy every "<language name>.txt" word list in the current directory to
// "wpdict/<wikipedia code>.txt", once per code mapped in $lang.
echo count( $lang );
// copy() cannot create the destination directory, so make sure it exists.
if( !is_dir( 'wpdict' ) && !mkdir( 'wpdict', 0777, true ) ){
    die( "FAILED -- creating wpdict directory\n" );
}
foreach( $lang as $langname => $val ){
    echo "\n$langname";
    // Normalise the mapping to a flat list of target codes.
    if( !is_array( $val ) ){
        $codes = array( $val );
    }elseif( isset( $val[0] ) && $val[0] === 'same' ){
        // 'same': every following element receives a copy of the same list.
        $codes = array_slice( $val, 1 );
    }else{
        // Other modes (e.g. 'merge') are not implemented yet.
        $codes = array();
    }
    if( count( $codes ) === 0 ){
        continue;
    }
    if( !file_exists( "$langname.txt" ) ){
        echo "\n$langname.txt DOES NOT EXIST!\n";
        continue;
    }
    foreach( $codes as $langcode ){
        if( !copy( "$langname.txt", "wpdict/$langcode.txt" ) ){
            // Report the code actually being written.
            die( "FAILED -- copying $langname.txt to wpdict/$langcode.txt\n" );
        }
    }
}
echo "\n\nDone.";
testFiltr.php
[edit]#!/usr/bin/php
<?php
/*
* testFiltr.php
*
* Copyright 2014 Amgine <amgine@Amgines-MacBook-Air.local>
*
* This program is free software. It comes without any warranty, to the extent
* permitted by applicable law. You can redistribute it and/or modify it under
* the terms of the Do What The Fuck You Want To Public License, Version 2, as
* published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
*
*/
/**
* REQUIREMENTS: It is expected this will be run in a *nix environment, plus
* * cURL
* * bzip2
*
* If your php is not located at /usr/bin/php, better fix the shebang.
**
* USE NOTE: This file must be marked as executable, chmod +x testFiltr.php
*
* example:
* curl http://dumps.wikimedia.org/enwiktionary/20140328/enwiktionary-20140328-pages-articles.xml.bz2 | bzcat | ./testFiltr.php
**/
/**
* function: getL2
**
* Collect the level 2 header texts found in a wikitext string.
*
* Each line is tested on its own; a header is "==", one or more word,
* whitespace, dash or underscore characters, then "==", at the start of
* the line. The captured text is trimmed before it is stored.
**
* @param string $text The wiki syntax string of the page.
* @return array The header texts (usually language names) in page order;
*               an empty array when there are none.
**/
function getL2( $text ){
    $headers = array();
    $lines = explode( "\n", $text );
    for( $n = 0, $total = count( $lines ); $n < $total; ++$n ){
        $found = array();
        $hits = preg_match_all( '/^==([\w\s\-‒-—‑¯_]+)==/u', $lines[$n], $found );
        if( !$hits ){
            continue;
        }
        $headers[] = trim( $found[1][0] );
    }
    return $headers;
}
/**
* function: add2Dictionary
**
* Process found term (breaking up phrases), check for uniqueness, and
* add to the dictionary.
**
* @param string $term The found term to be added to the dictionary.
* @param array $dictionary The dictionary which will be appended (by reference).
*                          A null value is treated as an empty dictionary.
* @param bool $whitespace When true the trimmed term is stored whole; when
*                         false (default) it is split on spaces into words.
**/
function add2Dictionary( $term, &$dictionary, $whitespace = false ){
    // A not-yet-created array slot passed by reference arrives as null.
    if( !is_array( $dictionary ) ){
        $dictionary = array();
    }
    $term = (string) $term;
    if( $whitespace ){
        // Keep the phrase intact.
        $words = array( $term );
    }elseif( preg_match( '/\w/u', $term ) ){
        // The u modifier makes \w recognise non-ASCII letters as well.
        $words = explode( ' ', $term );
    }else{
        $words = array();
    }
    foreach( $words as $word ){
        $word = trim( $word );
        // Skip the empty strings produced by consecutive spaces; compare
        // strictly so numeric-looking strings are not treated as equal.
        if( $word !== '' && !in_array( $word, $dictionary, true ) ){
            $dictionary[] = $word;
        }
    }
}
// Main script: read an uncompressed dump from STDIN line by line, collect each
// <page>...</page> element, and sort main-namespace titles into per-language
// word lists written to "<destPath><L2 header>.txt".
// http://dumps.wikimedia.org/mhwiktionary/20140401/mhwiktionary-20140401-pages-articles.xml.bz2
//$fh = fopen( '/Users/amgine/Downloads/mhwiktionary-20140401-pages-articles.xml', 'r' );
// $i counts pages found, $j counts lines read.
$i = $j = 0;
$dictionary = array();
// This is the directory the dictionaries will be stored in. It must exist or script will fail.
$destPath = 'enWT/';
if( !is_dir( $destPath ) ){
    die( "Destination directory $destPath does not exist.\n" );
}
// fgets() returns whole lines regardless of their length, so long wikitext
// lines are never cut into pieces.
while( ( $buffer = fgets( STDIN ) ) !== false ){
    ++$j;
    if( !preg_match( '/^<page>$/i', trim( $buffer ) ) ){
        continue;
    }
    // Accumulate the page element up to and including its closing tag.
    $node = rtrim( $buffer, "\r\n" );
    while( ( $buffer = fgets( STDIN ) ) !== false ){
        ++$j;
        $buffer = rtrim( $buffer, "\r\n" );
        $node .= "\n$buffer";
        if( preg_match( '/^<\/page>$/i', trim( $buffer ) ) ){
            break;
        }
    }
    ++$i;
    $element = simplexml_load_string( $node );
    if( $element === false ){
        // Truncated or malformed page (e.g. input ended mid-page).
        continue;
    }
    // Only the main namespace is indexed.
    if( (string) $element->ns !== '0' ){
        continue;
    }
    $title = (string) $element->title;
    $text = (string) $element->revision->text;
    // Skip pages tagged as vulgar or obscene; "(?:ity)?" makes the suffix
    // optional so both "vulgar" and "vulgarity" match.
    if( preg_match( '/\{\{context\|[^}]*(\bvulgar(?:ity)?\b|\bobscen(?:e|ity))[^}]*\}\}/iu', $text ) ){
        continue;
    }
    foreach( getL2( $text ) as $lang ){
        if( !array_key_exists( $lang, $dictionary ) ){
            $dictionary[$lang] = array();
        }
        // NB: Optional $whitespace = true for function allows multi-word phrases,
        // default does not. '''Default may create broken CJK phrases'''
        add2Dictionary( $title, $dictionary[$lang] );
        // Buffer 500 entries in ram per language; a reasonable performance vs. memory tradeoff
        if( count( $dictionary[$lang] ) > 500 ){
            // Write into $destPath like the final flush does; the trailing
            // newline keeps consecutive chunks from fusing two words.
            if( file_put_contents( $destPath . $lang . '.txt', implode( "\n", $dictionary[$lang] ) . "\n", FILE_APPEND ) === false ){
                die( "Writing $lang dictionary -- FAILED!\n" );
            }
            $dictionary[$lang] = array();
        }
    }
}
echo "\r$j lines processed.\n$i pages found.\n";
// File is parsed, flush buffered dictionaries
echo "\n\n";
foreach( $dictionary as $lang => $dict ){
    // A buffer that was just flushed has nothing left to write and must not
    // be reported as a failure.
    if( count( $dict ) === 0 ){
        continue;
    }
    echo "Writing $lang dictionary.";
    if( file_put_contents( $destPath . $lang . '.txt', implode( "\n", $dict ) . "\n", FILE_APPEND ) !== false ){
        echo "\rWrote $lang dictionary successfully.\n";
    }else{
        echo " -- FAILED!\n";
    }
}