Many changes. Work in progress, do not use.
This commit is contained in:
parent
95d23fe4fe
commit
fa093edfc8
2 changed files with 340 additions and 81 deletions
|
|
@ -6,94 +6,177 @@
|
|||
|
||||
define( 'REPORTING_INTERVAL', 10 );
|
||||
|
||||
function dumpHTML( $dest, $start ) {
|
||||
global $wgUser, $wgTitle, $wgArticle, $wgEnablePersistentLC, $wgLinkCache, $wgOut;
|
||||
global $wgMakeDumpLinks, $wgStylePath, $wgArticlePath, $wgUploadPath, $wgLogo;
|
||||
$wgMakeDumpLinks = true;
|
||||
$wgScriptPath = "../../..";
|
||||
$wgStylePath = "$wgScriptPath/skins";
|
||||
$wgUploadPath = "$wgScriptPath/images";
|
||||
$wgLogo = "$wgStylePath/common/images/wiki.png";
|
||||
$wgArticlePath = '../../$1';
|
||||
$dbr =& wfGetDB( DB_SLAVE );
|
||||
$end = $dbr->selectField( 'page', 'max(page_id)', false );
|
||||
|
||||
/*global $wgValidSkinNames;
|
||||
var_dump( $wgValidSkinNames );
|
||||
exit;*/
|
||||
require_once( 'includes/ImagePage.php' );
|
||||
require_once( 'includes/CategoryPage.php' );
|
||||
|
||||
print("Creating static HTML dump. Starting from page_id $start of $end.\n");
|
||||
class DumpHTML {
|
||||
var $dest, $interwiki, $depth, $sharedStaticPath;
|
||||
|
||||
$wgUser = new User;
|
||||
$wgUser->setOption( 'skin', 'htmldump' );
|
||||
$sk =& $wgUser->getSkin();
|
||||
|
||||
if ( !is_dir( $dest ) ) {
|
||||
if ( !mkdir( $dest, 0755 ) ) {
|
||||
print("Can't make directory $dir, exiting\n");
|
||||
return;
|
||||
/**
 * Constructor: remember the dump settings on the instance.
 *
 * @param string $dest      Base directory that output files are written under
 * @param bool   $interwiki Stored as-is; setupGlobals() negates it to derive the
 *                          "hide interlanguage links" setting
 * @param int    $depth     Default directory depth, used by setupGlobals() when it
 *                          is called without an explicit depth
 */
function DumpHTML( $dest, $interwiki = true, $depth = 3 ) {
	// Plain field initialisation only; no other work is done here.
	$this->depth = $depth;
	$this->interwiki = $interwiki;
	$this->dest = $dest;
}
|
||||
|
||||
/**
 * Write the articles whose page_id lies between $start and $end (inclusive).
 * Pages in the category and image namespaces are not written by this method.
 *
 * @param int       $start First page_id to process
 * @param int|false $end   Last page_id to process; false means "use the highest
 *                         page_id currently in the page table"
 */
function doArticles( $start, $end = false ) {
	$fname = 'DumpHTML::doArticles';

	$this->setupGlobals();

	if ( $end === false ) {
		$db =& wfGetDB( DB_SLAVE );
		$end = $db->selectField( 'page', 'max(page_id)', false, $fname );
	}

	for ( $pageId = $start; $pageId <= $end; $pageId++ ) {
		// Progress report every REPORTING_INTERVAL ids
		if ( $pageId % REPORTING_INTERVAL == 0 ) {
			print("$pageId\n");
		}

		$title = Title::newFromID( $pageId );
		if ( !$title ) {
			// No usable title for this id
			continue;
		}

		$namespace = $title->getNamespace();
		if ( $namespace == NS_CATEGORY || $namespace == NS_IMAGE ) {
			continue;
		}

		$this->doArticle( $title );
	}
}
|
||||
|
||||
/**
 * Write the main page (via doMainPage()) followed by the Special:Categories page.
 */
function doSpecials() {
	$this->doMainPage();

	// doMainPage() sets globals up with depth 0; restore the instance default
	$this->setupGlobals();

	echo "Special:Categories...";
	$categoriesPage = Title::makeTitle( NS_SPECIAL, 'Categories' );
	$this->doArticle( $categoriesPage );
	echo "\n";
}
|
||||
|
||||
/**
 * Render the wiki's main page and save it as index.html directly under the
 * destination directory.
 *
 * @return false|null false when index.html cannot be opened for writing,
 *                    otherwise nothing
 */
function doMainPage() {
	global $wgMakeDumpLinks;

	echo "Making index.html ";

	// Depth 0: index.html sits at the top of the dump, so the path globals
	// must not carry the extra ../ prefixes used for ordinary pages
	$this->setupGlobals( 0 );

	// ...but $wgMakeDumpLinks itself goes back to 3 afterwards
	// NOTE(review): presumably so links still target the 3-level directory layout - confirm
	$wgMakeDumpLinks = 3;

	$mainPage = Title::newMainPage();
	$html = $this->getArticleHTML( $mainPage );

	$handle = fopen( "{$this->dest}/index.html", "w" );
	if ( $handle ) {
		fwrite( $handle, $html );
		fclose( $handle );
		echo "\n";
		return;
	}

	echo "\nCan't open index.html for writing\n";
	return false;
}
|
||||
|
||||
/**
 * Write image description pages in two passes:
 *  1. every row of the image table whose title has no article ID;
 *  2. every file name found in the shared static directory
 *     ($this->sharedStaticPath, which copyImages() fills), both under the
 *     plain hashed directories and under thumb/.
 */
function doImageDescriptions() {
	$fname = 'DumpHTML::doImageDescriptions';

	$this->setupGlobals( 3 );

	/**
	 * Dump image description pages that don't have an associated article, but do
	 * have a local image
	 */
	$dbr =& wfGetDB( DB_SLAVE );
	$res = $dbr->select( 'image', array( 'img_name' ), false, $fname );

	$i = 0;
	print "Writing " . $dbr->numRows( $res ) . " image description pages for local images\n";
	while ( $row = $dbr->fetchObject( $res ) ) {
		if ( !( ++$i % REPORTING_INTERVAL ) ) {
			print "$i\t{$row->img_name}\n";
		}
		$title = Title::makeTitle( NS_IMAGE, $row->img_name );
		if ( $title->getArticleID() ) {
			// Already done by dumpHTML
			// NOTE(review): doArticles() in this file skips NS_IMAGE pages, so pages
			// skipped here may not be written anywhere - confirm which side should handle them
			continue;
		}
		$this->doArticle( $title );
	}

	/**
	 * Dump images which only have a real description page on commons
	 */
	print "Writing description pages for commons images\n";
	$i = 0;
	for ( $hash = 0; $hash < 256; $hash++ ) {
		// Hashed directory names: one hex digit, then two hex digits, e.g. "a/ab"
		$dir = sprintf( "%01x/%02x", intval( $hash / 16 ), $hash );

		$files = glob( "{$this->sharedStaticPath}/$dir/*" );
		$thumbs = glob( "{$this->sharedStaticPath}/thumb/$dir/*" );
		// glob() returns false on error; treat that as "nothing found"
		if ( !is_array( $files ) ) {
			$files = array();
		}
		if ( !is_array( $thumbs ) ) {
			$thumbs = array();
		}

		// Both lists are indexed from 0, so the array union operator (+) would
		// discard most of the second list; array_merge() appends instead.
		// The same name can appear in both lists, so write each name only once.
		$names = array_unique( array_map( 'basename', array_merge( $files, $thumbs ) ) );

		foreach ( $names as $file ) {
			if ( !( ++$i % REPORTING_INTERVAL ) ) {
				print "$i\t$file\n";
			}

			$title = Title::makeTitle( NS_IMAGE, $file );
			$this->doArticle( $title );
		}
	}
}
|
||||
|
||||
for ($id = $start; $id <= $end; $id++) {
|
||||
if ( !($id % REPORTING_INTERVAL) ) {
|
||||
print("$id\n");
|
||||
|
||||
/**
 * Write one category page for every distinct category name that appears as a
 * link target (cl_to) in the categorylinks table.
 */
function doCategories() {
	$fname = 'DumpHTML::doCategories';
	$this->setupGlobals();

	$dbr =& wfGetDB( DB_SLAVE );
	$categorylinks = $dbr->tableName( 'categorylinks' );
	print "Selecting categories...";
	// Use the name returned by tableName() rather than a hard-coded table name,
	// so the query follows whatever naming the database layer applies
	$sql = "SELECT DISTINCT cl_to FROM $categorylinks";
	$res = $dbr->query( $sql, $fname );

	print "\nWriting " . $dbr->numRows( $res ). " category pages\n";
	$i = 0;
	while ( $row = $dbr->fetchObject( $res ) ) {
		// Progress report every REPORTING_INTERVAL rows
		if ( !(++$i % REPORTING_INTERVAL ) ) {
			print "$i\t{$row->cl_to}\n";
		}
		$title = Title::makeTitle( NS_CATEGORY, $row->cl_to );
		$this->doArticle( $title );
	}
}
|
||||
|
||||
|
||||
/**
 * Write an article specified by title.
 *
 * Renders the page with getArticleHTML(), passes the images referenced by the
 * resulting XHTML to copyImages(), then saves the HTML with writeArticle().
 * Nothing is written when getArticleHTML() returns false.
 *
 * @param Title $title Page to write
 */
function doArticle( $title ) {
	$text = $this->getArticleHTML( $title );
	if ( $text === false ) {
		return;
	}

	# Parse the XHTML to find the images
	$images = $this->findImages( $text );
	$this->copyImages( $images );

	# Write to file
	$this->writeArticle( $title, $text );
}
|
||||
|
||||
if ( !is_dir( $parentDir ) ) {
|
||||
if ( !mkdir( $parentDir, 0744 ) ) {
|
||||
print("Can't write to directory $parentDir\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
if ( !is_dir( $fullDir ) ) {
|
||||
if ( !mkdir( $fullDir, 0744 ) ) {
|
||||
print("Can't write to directory $fullDir\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
/** Write the given text to the file identified by the given title object */
|
||||
function writeArticle( &$title, $text ) {
|
||||
$filename = $title->getHashedFilename();
|
||||
$fullName = "{$this->dest}/$filename";
|
||||
$fullDir = dirname( $fullName );
|
||||
|
||||
wfMkdirParents( $fullDir, 0755 );
|
||||
|
||||
$file = fopen( $fullName, 'w' );
|
||||
if ( !$file ) {
|
||||
|
|
@ -104,7 +187,148 @@ function dumpHTML( $dest, $start ) {
|
|||
fwrite( $file, $text );
|
||||
fclose( $file );
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Set up globals required for parsing.
 *
 * Rewrites the path-related globals as relative paths suitable for a page
 * stored $depth directories below the dump root, turns off the parser cache
 * and thumbnail scripts, and installs a fresh user that uses the 'htmldump'
 * skin with section edit links disabled.
 *
 * @param int|null $depth Directory depth to generate paths for; NULL means use
 *                        the depth given to the constructor
 */
function setupGlobals( $depth = NULL ) {
	global $wgUser, $wgTitle, $wgMakeDumpLinks, $wgStylePath, $wgArticlePath;
	global $wgUploadPath, $wgLogo, $wgMaxCredits, $wgSharedUploadPath;
	global $wgHideInterlanguageLinks, $wgUploadDirectory, $wgThumbnailScriptPath;
	global $wgSharedThumbnailScriptPath, $wgEnableParserCache;

	$wgMakeDumpLinks = is_null( $depth ) ? $this->depth : $depth;

	// $wgScriptPath is a local here (it is not declared global above); it only
	// serves as the base for the style and upload paths below.
	// Depth 3 gives "../../../.." for it and "../../../$1" for the article path.
	$wgScriptPath = '..' . str_repeat( '/..', $wgMakeDumpLinks );
	$wgArticlePath = str_repeat( '../', $wgMakeDumpLinks ) . '$1';
	$wgStylePath = "$wgScriptPath/skins";
	$wgUploadPath = "$wgScriptPath/images";
	$wgSharedUploadPath = "$wgUploadPath/shared";
	$wgLogo = "$wgStylePath/common/images/wiki.png";
	$wgMaxCredits = -1;
	// Must match the name in the global declaration above; a misspelled name
	// would only create an unused local and leave the real setting untouched
	$wgHideInterlanguageLinks = !$this->interwiki;
	$wgThumbnailScriptPath = $wgSharedThumbnailScriptPath = false;
	$wgEnableParserCache = false;

	$wgUser = new User;
	$wgUser->setOption( 'skin', 'htmldump' );
	$wgUser->setOption( 'editsection', 0 );

	// Directory that copyImages() fills and doImageDescriptions() scans
	$this->sharedStaticPath = "$wgUploadDirectory/shared";
}
|
||||
|
||||
/**
 * Reads the content of a title object, executes the skin and captures the result.
 *
 * Special pages are run through SpecialPage::executePath(); image pages,
 * category pages (when $wgUseCategoryMagic is on) and ordinary pages are
 * rendered through the view() method of the matching article class.
 *
 * @param Title|null $title Page to render
 * @return string|false The skin's complete output, or false when $title is null
 */
function getArticleHTML( &$title ) {
	global $wgOut, $wgTitle, $wgArticle, $wgUser, $wgUseCategoryMagic;

	$wgOut = new OutputPage;
	$wgOut->setParserOptions( new ParserOptions );

	// Plain assignment on purpose: binding a reference (=&) to a variable that
	// was imported with "global" only rebinds the function-local alias and
	// leaves the real global $wgTitle pointing at the previous page.
	$wgTitle = $title;
	if ( is_null( $title ) ) {
		return false;
	}

	$ns = $title->getNamespace();
	if ( $ns == NS_SPECIAL ) {
		SpecialPage::executePath( $wgTitle );
	} else {
		if ( $ns == NS_IMAGE ) {
			$wgArticle = new ImagePage( $wgTitle );
		} elseif ( $wgUseCategoryMagic && $ns == NS_CATEGORY ) {
			$wgArticle = new CategoryPage( $wgTitle );
		} else {
			$wgArticle = new Article( $wgTitle );
		}
		$wgArticle->view();
	}

	// The skin prints the page; buffer that output and hand it back as a string
	$sk =& $wgUser->getSkin();
	ob_start();
	$sk->outputPage( $wgOut );
	$text = ob_get_contents();
	ob_end_clean();

	return $text;
}
|
||||
|
||||
/**
 * Returns image paths used in an XHTML document.
 *
 * The text is run through an XML parser whose start-tag callback
 * (wfDumpStartTagHandler) records values into the global $wgDumpImages.
 *
 * @param string $text XHTML to scan
 * @return array Contents of $wgDumpImages after parsing
 */
function findImages( $text ) {
	global $wgOutputEncoding, $wgDumpImages;

	// Start from an empty collection; the tag handler fills it during parsing
	$wgDumpImages = array();

	$xmlParser = xml_parser_create( $wgOutputEncoding );
	xml_set_element_handler( $xmlParser, 'wfDumpStartTagHandler', 'wfDumpEndTagHandler' );
	xml_parse( $xmlParser, $text );
	xml_parser_free( $xmlParser );

	return $wgDumpImages;
}
|
||||
|
||||
/**
 * Copy images (or create symlinks) from commons to a static directory.
 * This is necessary even if you intend to distribute all of commons, because
 * the directory contents is used to work out which image description pages
 * are needed.
 *
 * @param array $images Map whose keys are image paths as they appear in the
 *                      generated XHTML (the values are ignored)
 */
function copyImages( $images ) {
	global $wgSharedUploadPath;

	# Find shared uploads and copy them into the static directory
	$sharedPathLength = strlen( $wgSharedUploadPath );
	foreach ( $images as $image => $dummy ) {
		# Is it shared?
		if ( substr( $image, 0, $sharedPathLength ) != $wgSharedUploadPath ) {
			continue;
		}

		# Reconstruct the path relative to the shared upload root
		$rel = substr( $image, $sharedPathLength + 1 ); // +1 for slash
		$this->copySharedFile( $rel );

		if ( substr( $rel, 0, 6 ) == 'thumb/' ) {
			# That was a thumbnail
			# We will also copy the real image
			$parts = explode( '/', $rel );
			// Segments 1-3 are used to build the path of the full-size file;
			// skip thumbnail paths that are too short to contain them
			if ( count( $parts ) >= 4 ) {
				$this->copySharedFile( "{$parts[1]}/{$parts[2]}/{$parts[3]}" );
			}
		}
	}
}

/**
 * Make one file from the shared upload directory available under the static
 * shared directory, unless something already exists at the destination.
 *
 * @param string $rel Path of the file relative to the shared upload directory
 */
function copySharedFile( $rel ) {
	global $wgSharedUploadDirectory;

	$sourceLoc = "$wgSharedUploadDirectory/$rel";
	$staticLoc = "{$this->sharedStaticPath}/$rel";

	if ( file_exists( $staticLoc ) ) {
		return;
	}

	wfMkdirParents( dirname( $staticLoc ), 0755 );
	if ( function_exists( 'symlink' ) ) {
		// symlink( target, link ): the link is created at the static location
		// and points back at the original shared upload
		symlink( $sourceLoc, $staticLoc );
	} else {
		copy( $sourceLoc, $staticLoc );
	}
}
|
||||
}
|
||||
|
||||
/**
 * XML parser callback for opening tags.
 *
 * Records the SRC attribute of every IMG element as a key of the global
 * $wgDumpImages (value true). Tag and attribute names are compared in upper case.
 *
 * @param resource $parser  Parser invoking the callback (unused)
 * @param string   $name    Element name
 * @param array    $attribs Attribute name => value map
 */
function wfDumpStartTagHandler( $parser, $name, $attribs ) {
	global $wgDumpImages;

	// Only <img> elements that actually carry a source are of interest
	if ( $name != 'IMG' || !isset( $attribs['SRC'] ) ) {
		return;
	}

	$source = $attribs['SRC'];
	$wgDumpImages[$source] = true;
}
|
||||
|
||||
/**
 * XML parser callback for closing tags.
 *
 * Intentionally does nothing; findImages() registers it alongside the
 * start-tag handler.
 *
 * @param resource $parser Parser invoking the callback (unused)
 * @param string   $name   Element name (unused)
 */
function wfDumpEndTagHandler( $parser, $name ) {
	// Nothing to do when an element closes
}
|
||||
|
||||
# vim: syn=php
|
||||
?>
|
||||
|
|
|
|||
|
|
@ -7,13 +7,13 @@
|
|||
|
||||
/** */
|
||||
|
||||
$optionsWithArgs = array( 's', 'd' );
|
||||
$optionsWithArgs = array( 's', 'd', 'e' );
|
||||
|
||||
require_once( "commandLine.inc" );
|
||||
require_once( "dumpHTML.inc" );
|
||||
|
||||
error_reporting( E_ALL & (~E_NOTICE) );
|
||||
|
||||
define( 'CHUNK_SIZE', 50 );
|
||||
|
||||
if ( !empty( $options['s'] ) ) {
|
||||
$start = $options['s'];
|
||||
|
|
@ -21,13 +21,48 @@ if ( !empty( $options['s'] ) ) {
|
|||
$start = 1;
|
||||
}
|
||||
|
||||
// Last page_id to dump: -e option, or the highest page_id in the page table
if ( !empty( $options['e'] ) ) {
	$end = $options['e'];
} else {
	$dbr =& wfGetDB( DB_SLAVE );
	$end = $dbr->selectField( 'page', 'max(page_id)', false );
}

// Destination directory: -d option, or "static"
if ( !empty( $options['d'] ) ) {
	$dest = $options['d'];
} else {
	$dest = 'static';
}

// Both bounds are placed on a shell command line below, so force them to integers
$start = intval( $start );
$end = intval( $end );

$d = new DumpHTML( $dest, true, 3 );

// !empty() rather than a bare array read: these switches are usually absent
if ( !empty( $options['special'] ) ) {
	$d->doSpecials();
} elseif ( !empty( $options['images'] ) ) {
	$d->doImageDescriptions();
} elseif ( !empty( $options['categories'] ) ) {
	$d->doCategories();
} else {
	if ( $end - $start > CHUNK_SIZE * 2 ) {
		// Split the problem into smaller chunks, run them in different PHP instances
		// This is a memory/resource leak workaround
		print("Creating static HTML dump. Starting from page_id $start of $end.\n");
		chdir( "maintenance" );
		// "<=" so that a final chunk consisting of just page $end still runs
		for ( $chunkStart = $start; $chunkStart <= $end; $chunkStart += CHUNK_SIZE ) {
			$chunkEnd = min( $chunkStart + CHUNK_SIZE - 1, $end );
			// Pass the destination on, otherwise each child falls back to the
			// default directory instead of the one this process writes to.
			// NOTE(review): a relative $dest is resolved by the child itself - confirm
			// the child ends up in the same working directory as this process
			passthru( "php dumpHTML.php -d " . escapeshellarg( $dest ) .
				" -s $chunkStart -e $chunkEnd" );
		}
		chdir( ".." );
		$d->doImageDescriptions();
		$d->doCategories();
		$d->doMainPage();
	} else {
		$d->doArticles( $start, $end );
	}
}

exit();
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue