User:Amgine/Dump processing/test xml.php
Appearance
<?php
/*
* test_xml.php
*
* Copyright 2014 Amgine <amgine@Amgines-MacBook-Air.local>
*
* This program is free software. It comes without any warranty, to the extent
* permitted by applicable law. You can redistribute it and/or modify it under
* the terms of the Do What The Fuck You Want To Public License, Version 2, as
* published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
*
*/
/**
* function: getL2
**
* Return an array of L2 headers from a wikitext string.
*
* The text is examined one line at a time. A line is taken as a level 2
* header when it begins with "==", followed by one or more word, whitespace,
* dash-like or underscore characters, followed by "==". Deeper headers such
* as "===Noun===" are not picked up, because "=" is not in the allowed
* character set for the header text.
**
* @param string $text The wiki syntax string of the page.
* @return array An array of language names (or other level 2 header texts),
*               trimmed and in document order. Empty when no L2 header is found.
**/
function getL2( $text ){
    $l2 = array();
    // Cast explicitly: callers may pass a string-convertible object.
    foreach( explode( "\n", (string) $text ) as $line ){
        $matches = array();
        // The pattern is anchored to the start of the line and has no "m"
        // flag, so it can match at most once per line.
        if( preg_match( '/^==([\w\s\-‒-—‑¯_]+)==/u', $line, $matches ) ){
            $l2[] = trim( $matches[1] );
        }
    }
    // Always return an array, as documented, instead of falling off the end
    // (and implicitly returning null) when the page has no L2 headers.
    return $l2;
}
/**
* function: add2Dictionary
**
* Process found term (breaking up phrases), check for uniqueness, and
* add to the dictionary.
*
* Uniqueness is only checked against the entries currently buffered in
* $dictionary; once the buffer has been flushed to disk, a term that was
* already written can be buffered (and written) again.
*
* When the buffer grows past 500 entries it is appended to the file
* "<$lang>.txt" in the current working directory, one entry per line, and
* the buffer is emptied. The script is terminated with die() if that write
* fails.
**
* @param string $term The found term to be added to the dictionary.
* @param array $dictionary The dictionary which will be appended (by reference).
*                          A non-array value (e.g. null) is treated as empty.
* @param string $lang Name of the dictionary; used as the output file's base name.
* @param bool $whitespace When true, $term is stored whole (multi-word phrases
*                         are kept). When false (default), $term is split on
*                         single spaces and each word is stored separately,
*                         provided $term contains at least one word character.
**/
function add2Dictionary( $term, &$dictionary, $lang, $whitespace = false ){
    // A caller may pass an array slot that was never initialised; start
    // from an empty buffer instead of handing null to in_array().
    if( !is_array( $dictionary ) ){
        $dictionary = array();
    }
    $term = (string) $term;

    if( $whitespace ){
        $candidates = array( $term );
    }elseif( preg_match( '/[\w]+/', $term ) ){
        $candidates = explode( ' ', $term );
    }else{
        // Nothing word-like in the term: nothing to add.
        $candidates = array();
    }

    foreach( $candidates as $word ){
        $word = trim( $word );
        // Skip the empty strings produced by consecutive spaces, and compare
        // strictly so numeric-looking strings ("1e3" vs "1000", "0" vs "00")
        // are not mistaken for duplicates of each other.
        if( $word !== '' && !in_array( $word, $dictionary, true ) ){
            $dictionary[] = $word;
        }
    }

    // Buffer 500 entries in ram per language; a reasonable performance vs. memory trade-off
    if( count( $dictionary ) > 500 ){
        // Terminate the chunk with a newline so the first entry of the next
        // appended chunk does not end up glued to the last entry of this one.
        $chunk = implode( "\n", $dictionary ) . "\n";
        // file_put_contents() returns the byte count, or false on failure.
        if( file_put_contents( $lang . '.txt', $chunk, FILE_APPEND ) === false ){
            die( "Writing $lang dictionary -- FAILED!\n" );
        }
        $dictionary = array();
    }
}
// ---------------------------------------------------------------------------
// Main script: stream a MediaWiki XML dump page by page and build one word
// list per level 2 header (language name), plus a 'blacklist' list for pages
// whose text carries an excluded {{context|...}} label. Lists are buffered in
// $dictionary and written to "<name>.txt" files in the working directory.
// ---------------------------------------------------------------------------
$reader = new XMLReader();
// The dump to parse may be given as the first command-line argument; when it
// is omitted the original hard-coded location is used.
$dumpFile = isset( $argv[1] )
    ? $argv[1]
    : '/Volumes/VERBATIM HD/enwiktionary-20140311-pages-meta-current.xml';
if( !$reader->open( $dumpFile ) ){
    die( "Failed to open file for xml reading.\n" );
}
// Advance the cursor to the first <page> element, wherever it sits in the
// document, rather than relying on a fixed number of read() calls.
$havePage = false;
while( $reader->read() ){
    if( $reader->nodeType === XMLReader::ELEMENT && $reader->localName === 'page' ){
        $havePage = true;
        break;
    }
}
// Buffers keyed by dictionary name (language name or 'blacklist').
$dictionary = array();
# Try Unicode magic to check for: MIXED_SCRIPT_CONFUSABLE, SINGLE_SCRIPT, INVISIBLE
# https://ssl.icu-project.org/apiref/icu4c/uspoof_8h.html#a0dbd60e53a571689baf65c63f4de8155
# @FIXME: Before this, remove words not in the main script of the language (requires name->code)
# @FIXME: Currently stripping all non-Latin!
$checker = new Spoofchecker();
// Same bit mask as the literal 50 (2 | 16 | 32), spelled with the named constants.
$checker->setChecks(
    Spoofchecker::MIXED_SCRIPT_CONFUSABLE | Spoofchecker::SINGLE_SCRIPT | Spoofchecker::INVISIBLE
);
$i = 0;
while( $havePage ){
    // Grab <page> node, make simple xml object of it.
    $element = simplexml_load_string( $reader->readOuterXML() );
    // simplexml_load_string() returns false for unparsable input; skip such
    // a node instead of dereferencing false.
    if( $element !== false ){
        $title = (string) $element->title;
        if( (string) $element->ns === '0' && !$checker->isSuspicious( $title ) ){
            // If in main namespace, and not confusable, grab L2 headers
            // @FIXME: Create a smarter exclusionary system
            // @TODO: Figure out what the consensus is on proper names
            $wikitext = (string) $element->revision->text;
            // "(?:ity)?" makes the whole suffix optional so that plain
            // "vulgar" is excluded as well as "vulgarity".
            if( !preg_match( '/\{\{context\|[^}]*\b(vulgar(?:ity)?\b|obscen(?:e|ity)|offensive|pejorative|medicine|slang)[^}]*\}\}/iu', $wikitext ) ){
                $l2s = getL2( $wikitext );
                // Guard against a non-array result (no L2 headers found).
                if( is_array( $l2s ) ){
                    foreach( $l2s as $lang ){
                        if( !array_key_exists( $lang, $dictionary ) ){
                            $dictionary[$lang] = array();
                        }
                        // NB: Optional $whitespace = true for function allows multi-word phrases,
                        // default does not. '''Default may create broken CJK phrases'''
                        add2Dictionary( $title, $dictionary[$lang], $lang );
                    }
                }
            } else {
                // Create the blacklist buffer on first use, like the
                // per-language buffers, so it is never passed as null.
                if( !array_key_exists( 'blacklist', $dictionary ) ){
                    $dictionary['blacklist'] = array();
                }
                // Fourth argument is $whitespace: keep blacklisted titles whole.
                add2Dictionary( $title, $dictionary['blacklist'], 'blacklist', true );
            }
        }
    }
    ++$i;
    echo "\r$i";
    $havePage = $reader->next( 'page' );
}
$reader->close();
// File is parsed, flush buffered dictionaries
echo "\n\n";
foreach( $dictionary as $lang => $dict ){
    echo "Writing $lang dictionary.";
    // End the chunk with a newline; an empty buffer contributes nothing.
    $payload = count( $dict ) > 0 ? implode( "\n", $dict ) . "\n" : '';
    // file_put_contents() returns the byte count, which is 0 (not a failure)
    // for an empty buffer; only false signals an error.
    if( file_put_contents( $lang . '.txt', $payload, FILE_APPEND ) !== false ){
        echo "\rWrote $lang dictionary successfully.\n";
    }else{
        echo " -- FAILED!\n";
    }
}
// @FIXME: move langname dictionaries to langcode dictionaries.