Manual:Chris G's botclasses/ExportAllPagesBot.php
Appearance
This bot uses Chris G's botclasses to export all pages from a wiki and store them in XML files. Run AllPagesBot.php first to generate the page-title text files (PageTitles.txt and PageTitlesNs6.txt) that this bot reads.
Hack
To make it work, you'll need to hack botclasses.php so that $http is a public rather than a private property of the wikipedia class. So, change:
class wikipedia {
private $http;
private $token;
private $ecTimestamp;
public $url;
to:
class wikipedia {
public $http;
private $token;
private $ecTimestamp;
public $url;
Code
<?php
/* ExportAllPagesBot
* By Leucosticte, https://www.mediawiki.org/wiki/User:Leucosticte
* GNU Public License 2.0
*
* This bot exports all pages from a wiki and stores them in XML files.
*/
/* Setup my classes. */
// require_once instead of include: nothing below can run without botclasses.php,
// so stop right here rather than warn and then crash on "new wikipedia".
require_once 'botclasses.php';
$wiki = new wikipedia;
// Wikimedia sites are served over HTTPS only; a POST sent to the plain-HTTP
// address is redirected and loses its body, so talk to the HTTPS endpoint directly.
$wiki->url = "https://en.wikipedia.org/w/api.php";

/* All the login stuff. */
$user = 'REMOVED';
$pass = 'REMOVED';
$wiki->login( $user, $pass );

// Configuration
$dir = "./xmlfiles";                      // Output directory for the XML dumps
$filePrefix = "$dir/Wikipedia-";          // Prefix for files built from $pageTitlesFile
$filePrefixNs6 = "$dir/Wikipedia-Ns6-";   // Prefix for files built from $pageTitlesNs6File
// The export URL is assembled as $url1 . <encoded titles> . $url3
$url1 = "https://en.wikipedia.org/w/index.php?title=Special:Export&pages=";
$url2 = '';
$url3 = "&history&action=submit"; // Get the full history
$pageTitlesFile = 'PageTitles.txt';
$pageTitlesNs6File = 'PageTitlesNs6.txt';
$pagesPerFile = 100; // Put this many pages in each XML file

// Create directory if it doesn't exist
if ( !file_exists( $dir ) ) {
	echo "Creating directory $dir...\n";
	if ( !mkdir( $dir ) ) {
		// Without the directory every later fopen() would fail, so stop now
		die( "Unable to create directory $dir\n" );
	}
}
if ( !is_dir( $dir ) ) {
	die ( "$dir is not a directory\n" );
}

// Test file existence
if ( !file_exists ( $pageTitlesFile ) ) {
	die ( "File $pageTitlesFile not found" );
}
if ( !file_exists ( $pageTitlesNs6File ) ) {
	die ( "File $pageTitlesNs6File not found" );
}

// Read files, one page title per line. Blank lines are skipped so that they are
// not sent to Special:Export as empty page titles.
$lines = file( $pageTitlesFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES );
if ( $lines === false ) {
	die( "Unable to read $pageTitlesFile\n" );
}
$linesNs6 = file( $pageTitlesNs6File, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES );
if ( $linesNs6 === false ) {
	die( "Unable to read $pageTitlesNs6File\n" );
}

// Iterate over other namespaces, then over the file namespace
iterate( $wiki, $lines, $filePrefix, $pagesPerFile, $url1, $url2, $url3 );
iterate( $wiki, $linesNs6, $filePrefixNs6, $pagesPerFile, $url1, $url2, $url3 );
/**
 * Export the given page titles in batches, writing one XML file per batch.
 *
 * The titles are split into groups of at most $pagesPerFile. For each group the
 * URL $url1 . <url-encoded titles joined by %0A> . $url3 is requested through
 * $wiki->http->post() and the response is written to
 * "{$filePrefix}{NN}.xml", where NN is the 1-based batch number, zero-padded to
 * the width of the highest batch number so the files sort in order.
 *
 * @param object $wiki Bot object exposing a public $http member with a post( $url, $data ) method
 * @param array $lines Page titles, one per element
 * @param string $filePrefix Path and filename prefix for the generated XML files
 * @param int $pagesPerFile Maximum number of pages per XML file; must be at least 1
 * @param string $url1 Part of the export URL that precedes the page titles
 * @param string $url2 Title text to place ahead of the first batch's titles (normally '')
 * @param string $url3 Part of the export URL that follows the page titles
 * @throws InvalidArgumentException If $pagesPerFile is less than 1
 * @throws RuntimeException If an output file cannot be opened for writing
 */
function iterate( $wiki, $lines, $filePrefix, $pagesPerFile, $url1, $url2, $url3 ) {
	$pagesPerFile = (int)$pagesPerFile;
	if ( $pagesPerFile < 1 ) {
		throw new InvalidArgumentException( 'pagesPerFile must be at least 1' );
	}

	// array_chunk() re-indexes, so batching does not depend on the keys of $lines
	$batches = array_chunk( $lines, $pagesPerFile );

	// Pad width comes from the integer number of files. The old formula measured
	// the string form of a float quotient (e.g. "2.5"), which gave wrong widths.
	$digits = strlen( (string)count( $batches ) );

	// Only the first batch carries the caller-supplied $url2
	$leading = (string)$url2;

	foreach ( $batches as $index => $batch ) {
		$titles = implode( '%0A', array_map( 'urlencode', $batch ) );
		// Compare against '' explicitly: a title of "0" is falsy in PHP and
		// would otherwise lose its separator.
		if ( $leading !== '' ) {
			$titles = $leading . '%0A' . $titles;
			$leading = '';
		}

		// This is part of the filename; add leading zeros
		$strFileNumber = str_pad( (string)( $index + 1 ), $digits, '0', STR_PAD_LEFT );
		$filename = $filePrefix . $strFileNumber . ".xml";
		$url = $url1 . $titles . $url3;

		echo "Creating file $filename...\n";
		$f = fopen( $filename, "w" );
		if ( $f === false ) {
			throw new RuntimeException( "Unable to open $filename for writing" );
		}
		$g = $wiki->http->post( $url, true );
		fwrite( $f, $g );
		fclose( $f );
	}
}