Jump to content

Manual:Chris G's botclasses/ExportAllPagesBot.php

From mediawiki.org

This bot uses Chris G's botclasses to export all pages from a wiki, including their full history, and store them in XML files. Run AllPagesBot.php first to generate the text files this bot reads (PageTitles.txt and PageTitlesNs6.txt).

Hack

[edit]

To make it work, you'll need to modify botclasses.php so that the $http property of the wikipedia class is public rather than private, because this bot calls $wiki->http->post() directly. So, change:

class wikipedia {
    private $http;
    private $token;
    private $ecTimestamp;
    public $url;

to:

class wikipedia {
    public $http;
    private $token;
    private $ecTimestamp;
    public $url;

Code

[edit]
<?php
/* ExportAllPagesBot
 * By Leucosticte, https://www.mediawiki.org/wiki/User:Leucosticte
 * GNU Public License 2.0
 *
 * This bot exports all pages from a wiki and stores them in XML files.
 */

/* Setup my classes. */
// require_once instead of include: nothing below can run without the wikipedia
// class, so a missing botclasses.php should stop the script right here.
require_once( 'botclasses.php' );
$wiki      = new wikipedia;
$wiki->url = "http://en.wikipedia.org/w/api.php";

/* All the login stuff. */
$user = 'REMOVED';
$pass = 'REMOVED';
$wiki->login( $user, $pass );

// Configuration
$dir = "./xmlfiles"; // Output directory for the XML files
$filePrefix = "$dir/Wikipedia-"; // Filename prefix for pages listed in $pageTitlesFile
$filePrefixNs6 = "$dir/Wikipedia-Ns6-"; // Filename prefix for pages listed in $pageTitlesNs6File
$url1 = "http://en.wikipedia.org/w/index.php?title=Special:Export&pages=";
$url2 = ''; // The encoded page titles are accumulated here by iterate()
$url3 = "&history&action=submit"; // Get the full history
$pageTitlesFile = 'PageTitles.txt';
$pageTitlesNs6File = 'PageTitlesNs6.txt';
$pagesPerFile = 100; // Put this many pages in each XML file

// Create directory if it doesn't exist
if ( !file_exists( $dir ) ) {
    echo "Creating directory $dir...\n";
    if ( !mkdir( $dir ) ) {
        die( "Could not create directory $dir\n" );
    }
}
if ( !is_dir( $dir ) ) {
    die( "$dir is not a directory\n" );
}

// Test file existence
if ( !file_exists( $pageTitlesFile ) ) {
    die( "File $pageTitlesFile not found\n" );
}
if ( !file_exists( $pageTitlesNs6File ) ) {
    die( "File $pageTitlesNs6File not found\n" );
}

// Read files; file() returns false when a file exists but cannot be read
$lines = file( $pageTitlesFile, FILE_IGNORE_NEW_LINES );
if ( $lines === false ) {
    die( "Could not read file $pageTitlesFile\n" );
}
$linesNs6 = file( $pageTitlesNs6File, FILE_IGNORE_NEW_LINES );
if ( $linesNs6 === false ) {
    die( "Could not read file $pageTitlesNs6File\n" );
}

// Iterate over other namespaces, then over the file namespace
iterate( $wiki, $lines, $filePrefix, $pagesPerFile, $url1, $url2, $url3 );
iterate( $wiki, $linesNs6, $filePrefixNs6, $pagesPerFile, $url1, $url2, $url3 );

/**
 * Export page titles in batches and write each batch's response to its own XML file.
 *
 * Titles are URL-encoded and joined with an encoded newline (%0A). Every
 * $pagesPerFile titles (and once more for a final partial batch) the URL
 * $url1 . <titles> . $url3 is requested through $wiki->http->post() and the
 * response is written to "<filePrefix><zero-padded file number>.xml".
 *
 * @param object $wiki Bot object whose public $http property provides post( $url, $data )
 * @param array $lines Page titles, one per element
 * @param string $filePrefix Path and filename prefix of the generated XML files
 * @param int $pagesPerFile Maximum number of pages per XML file; must be at least 1
 * @param string $url1 First part of the request URL, placed before the titles
 * @param string $url2 Already-encoded titles to put at the start of the first batch (normally '')
 * @param string $url3 Last part of the request URL, placed after the titles
 * @throws InvalidArgumentException If $pagesPerFile is less than 1
 */
function iterate( $wiki, $lines, $filePrefix, $pagesPerFile, $url1, $url2, $url3 ) {
    if ( $pagesPerFile < 1 ) {
        throw new InvalidArgumentException( "pagesPerFile must be at least 1" );
    }

    $totalLines = count( $lines );
    // The number of files is the quotient rounded UP to a whole number. Taking
    // strlen() of the raw float quotient (e.g. "2.5") counted the decimal
    // point and the fraction digits, which produced too many leading zeros.
    $totalFiles = (int)ceil( $totalLines / $pagesPerFile );
    $digits = strlen( (string)$totalFiles ); // Width of the zero-padded file number
    $fileNumber = 0; // Incrementing number appended to the filename
    $position = 0; // 1-based position in $lines, independent of the array keys
    $url2 = (string)$url2;

    foreach ( $lines as $line ) {
        $position++;
        // Compare against '' explicitly: the page title "0" is a non-empty
        // string that PHP treats as false in a boolean context.
        if ( $url2 !== '' ) {
            $url2 .= "%0A";
        }
        $url2 .= urlencode( $line );

        // Flush when the batch is full or this is the last title
        if ( $position % $pagesPerFile === 0 || $position === $totalLines ) {
            $fileNumber++;
            $strFileNumber = str_pad( (string)$fileNumber, $digits, '0', STR_PAD_LEFT );
            $filename = $filePrefix . $strFileNumber . ".xml";
            $url = $url1 . $url2 . $url3;
            echo "Creating file $filename...\n";
            $contents = $wiki->http->post( $url, true );
            // Report a failed write and carry on with the remaining batches
            if ( file_put_contents( $filename, $contents ) === false ) {
                echo "Could not write file $filename\n";
            }
            $url2 = '';
        }
    }
}