Many changes. Work in progress, do not use.

This commit is contained in:
Tim Starling 2005-05-28 07:05:28 +00:00
parent 95d23fe4fe
commit fa093edfc8
2 changed files with 340 additions and 81 deletions

View file

@ -6,94 +6,177 @@
define( 'REPORTING_INTERVAL', 10 );
function dumpHTML( $dest, $start ) {
global $wgUser, $wgTitle, $wgArticle, $wgEnablePersistentLC, $wgLinkCache, $wgOut;
global $wgMakeDumpLinks, $wgStylePath, $wgArticlePath, $wgUploadPath, $wgLogo;
$wgMakeDumpLinks = true;
$wgScriptPath = "../../..";
$wgStylePath = "$wgScriptPath/skins";
$wgUploadPath = "$wgScriptPath/images";
$wgLogo = "$wgStylePath/common/images/wiki.png";
$wgArticlePath = '../../$1';
$dbr =& wfGetDB( DB_SLAVE );
$end = $dbr->selectField( 'page', 'max(page_id)', false );
/*global $wgValidSkinNames;
var_dump( $wgValidSkinNames );
exit;*/
require_once( 'includes/ImagePage.php' );
require_once( 'includes/CategoryPage.php' );
print("Creating static HTML dump. Starting from page_id $start of $end.\n");
class DumpHTML {
var $dest, $interwiki, $depth, $sharedStaticPath;
$wgUser = new User;
$wgUser->setOption( 'skin', 'htmldump' );
$sk =& $wgUser->getSkin();
if ( !is_dir( $dest ) ) {
if ( !mkdir( $dest, 0755 ) ) {
print("Can't make directory $dir, exiting\n");
return;
/**
 * Constructor.
 *
 * Declared as __construct() so that it is invoked by `new DumpHTML(...)` on
 * PHP versions that no longer treat a method named after the class as the
 * constructor. It is declared before the class-named method on purpose.
 *
 * @param string $dest      Directory the dump is written into
 * @param bool   $interwiki Stored in $this->interwiki (read by setupGlobals())
 * @param int    $depth     Stored in $this->depth; default link depth used by
 *                          setupGlobals() when it is called without argument
 */
function __construct( $dest, $interwiki = true, $depth = 3 ) {
	$this->dest = $dest;
	$this->interwiki = $interwiki;
	$this->depth = $depth;
}

/**
 * Class-named constructor, kept for PHP versions that only know this form
 * and for any caller that invokes it explicitly. Forwards to __construct().
 */
function DumpHTML( $dest, $interwiki = true, $depth = 3 ) {
	$this->__construct( $dest, $interwiki, $depth );
}
/**
 * Write a set of articles specified by start and end page_id.
 *
 * Categories are skipped, they are written separately by doCategories().
 * Image description pages that exist as real pages are written here:
 * doImageDescriptions() deliberately skips every image title that has a
 * page ID, so excluding NS_IMAGE in this loop as well would leave those
 * pages out of the dump entirely.
 *
 * @param int       $start First page_id to write
 * @param int|false $end   Last page_id to write (inclusive); false selects
 *                         max(page_id) from the page table
 */
function doArticles( $start, $end = false ) {
	$fname = 'DumpHTML::doArticles';

	$this->setupGlobals();

	if ( $end === false ) {
		$dbr =& wfGetDB( DB_SLAVE );
		$end = $dbr->selectField( 'page', 'max(page_id)', false, $fname );
	}

	for ( $id = $start; $id <= $end; $id++ ) {
		# Progress report
		if ( !( $id % REPORTING_INTERVAL ) ) {
			print("$id\n");
		}

		# IDs with no matching page yield no title object and are skipped
		$title = Title::newFromID( $id );
		if ( !$title ) {
			continue;
		}

		if ( $title->getNamespace() != NS_CATEGORY ) {
			$this->doArticle( $title );
		}
	}
}
/**
 * Write the main page (via doMainPage()) followed by Special:Categories.
 */
function doSpecials() {
	$this->doMainPage();

	# doMainPage() changes the link globals, so set them up again with the
	# default depth before writing the special page
	$this->setupGlobals();

	print "Special:Categories...";
	$categoriesTitle = Title::makeTitle( NS_SPECIAL, 'Categories' );
	$this->doArticle( $categoriesTitle );
	print "\n";
}
/**
 * Write the main page as index.html in the destination directory.
 *
 * @return false|null false when index.html cannot be opened for writing,
 *                    otherwise no value
 */
function doMainPage() {
	global $wgMakeDumpLinks;

	print "Making index.html ";

	# Path globals are computed for depth 0 (no ../../.. in the link URLs),
	# while $wgMakeDumpLinks is forced back to 3 so that links still use
	# that directory style
	$this->setupGlobals( 0 );
	$wgMakeDumpLinks = 3;

	$mainTitle = Title::newMainPage();
	$html = $this->getArticleHTML( $mainTitle );

	$indexPath = "{$this->dest}/index.html";
	$handle = fopen( $indexPath, "w" );
	if ( $handle ) {
		fwrite( $handle, $html );
		fclose( $handle );
		print "\n";
		return;
	}

	print "\nCan't open index.html for writing\n";
	return false;
}
/**
 * Write image description pages in two passes:
 *
 *  1. For every row of the image table, write the description page unless
 *     the title has a page ID (those are expected to be covered by the
 *     article pass).
 *  2. For every file found below $this->sharedStaticPath (plain and thumb
 *     hash directories), write a description page named after the file.
 */
function doImageDescriptions() {
	$fname = 'DumpHTML::doImageDescriptions';

	$this->setupGlobals( 3 );

	/**
	 * Dump image description pages that don't have an associated article, but do
	 * have a local image
	 */
	$dbr =& wfGetDB( DB_SLAVE );
	$res = $dbr->select( 'image', array( 'img_name' ), false, $fname );

	$i = 0;
	print "Writing " . $dbr->numRows( $res ) . " image description pages for local images\n";
	while ( $row = $dbr->fetchObject( $res ) ) {
		if ( !( ++$i % REPORTING_INTERVAL ) ) {
			print "$i\t{$row->img_name}\n";
		}
		$title = Title::makeTitle( NS_IMAGE, $row->img_name );
		if ( $title->getArticleID() ) {
			// Has a page ID: left to the article pass
			continue;
		}
		$this->doArticle( $title );
	}

	/**
	 * Dump images which only have a real description page on commons
	 */
	print "Writing description pages for commons images\n";
	$i = 0;
	for ( $hash = 0; $hash < 256; $hash++ ) {
		# Two-level hash directory, e.g. "a/ab"
		$dir = sprintf( "%01x/%02x", intval( $hash / 16 ), $hash );
		$patterns = array(
			"{$this->sharedStaticPath}/$dir/*",
			"{$this->sharedStaticPath}/thumb/$dir/*"
		);

		# Collect names from both directories. The results are appended
		# rather than combined with "+", which would discard every entry of
		# the second list whose numeric index already exists in the first.
		# glob() may return false instead of an array, so check before use.
		$files = array();
		foreach ( $patterns as $pattern ) {
			$matches = glob( $pattern );
			if ( !is_array( $matches ) ) {
				continue;
			}
			foreach ( $matches as $path ) {
				$files[] = basename( $path );
			}
		}

		# A name present in both directories only needs one page
		foreach ( array_unique( $files ) as $file ) {
			if ( !( ++$i % REPORTING_INTERVAL ) ) {
				print "$i\t$file\n";
			}
			$title = Title::makeTitle( NS_IMAGE, $file );
			$this->doArticle( $title );
		}
	}
}
for ($id = $start; $id <= $end; $id++) {
if ( !($id % REPORTING_INTERVAL) ) {
print("$id\n");
/**
 * Write one category page for every distinct cl_to value found in the
 * categorylinks table.
 */
function doCategories() {
	$fname = 'DumpHTML::doCategories';

	$this->setupGlobals();

	$dbr =& wfGetDB( DB_SLAVE );
	# Use the name returned by the database object in the query instead of a
	# hard-coded literal, so any name mapping it applies is respected
	$categorylinks = $dbr->tableName( 'categorylinks' );

	print "Selecting categories...";
	$sql = "SELECT DISTINCT cl_to FROM $categorylinks";
	$res = $dbr->query( $sql, $fname );

	print "\nWriting " . $dbr->numRows( $res ). " category pages\n";
	$i = 0;
	while ( $row = $dbr->fetchObject( $res ) ) {
		if ( !( ++$i % REPORTING_INTERVAL ) ) {
			print "$i\t{$row->cl_to}\n";
		}
		$title = Title::makeTitle( NS_CATEGORY, $row->cl_to );
		$this->doArticle( $title );
	}
}
/** Write an article specified by title */
function doArticle( $title ) {
global $wgTitle, $wgSharedUploadPath, $wgSharedUploadDirectory;
global $wgUploadDirectory;
$wgOut = new OutputPage;
$wgOut->setArticleFlag( true );
$wgOut->setRobotpolicy( 'index,follow' );
$wgTitle = Title::newFromID( $id );
if ( is_null( $wgTitle ) ) {
continue;
$text = $this->getArticleHTML( $title );
if ( $text === false ) {
return;
}
$wgArticle = new Article( $wgTitle );
$text = $wgArticle->getContent( true );
$wgLinkCache = new LinkCache;
$wgLinkCache->forUpdate( true );
global $wgLinkHolders;
$wgLinkHolders = array(
'namespaces' => array(),
'dbkeys' => array(),
'queries' => array(),
'texts' => array(),
'titles' => array()
);
# Parse the XHTML to find the images
$images = $this->findImages( $text );
$this->copyImages( $images );
# Parse the text and replace links with placeholders
$wgOut->setPageTitle( $wgTitle->getPrefixedText() );
$wgOut->addWikiText( $text );
$wgOut->transformBuffer();
# Execute skin to get complete HTML
ob_start();
$sk->outputPage( $wgOut );
$text = ob_get_contents();
ob_end_clean();
# Write to file
$fname = $wgTitle->getHashedFilename();
$bits = explode( '/', $fname );
$parentDir = "$dest/{$bits[0]}";
$fullDir = "$dest/{$bits[0]}/{$bits[1]}";
$fullName = "$dest/$fname";
$this->writeArticle( $title, $text );
}
if ( !is_dir( $parentDir ) ) {
if ( !mkdir( $parentDir, 0744 ) ) {
print("Can't write to directory $parentDir\n");
return;
}
}
if ( !is_dir( $fullDir ) ) {
if ( !mkdir( $fullDir, 0744 ) ) {
print("Can't write to directory $fullDir\n");
return;
}
}
/** Write the given text to the file identified by the given title object */
function writeArticle( &$title, $text ) {
$filename = $title->getHashedFilename();
$fullName = "{$this->dest}/$filename";
$fullDir = dirname( $fullName );
wfMkdirParents( $fullDir, 0755 );
$file = fopen( $fullName, 'w' );
if ( !$file ) {
@ -104,7 +187,148 @@ function dumpHTML( $dest, $start ) {
fwrite( $file, $text );
fclose( $file );
}
}
/**
 * Set up globals required for parsing.
 *
 * Sets $wgMakeDumpLinks and derives the relative article, style, upload and
 * logo paths from it, turns off credits, thumbnail scripts and the parser
 * cache, installs a fresh user with the 'htmldump' skin and no section edit
 * links, and records the shared static directory in $this->sharedStaticPath.
 *
 * @param int|null $depth Value for $wgMakeDumpLinks, i.e. the number of
 *                        "../" segments put in front of article links;
 *                        NULL means $this->depth
 */
function setupGlobals( $depth = NULL ) {
	global $wgUser, $wgMakeDumpLinks, $wgStylePath, $wgArticlePath;
	global $wgUploadPath, $wgLogo, $wgMaxCredits, $wgSharedUploadPath;
	global $wgHideInterlanguageLinks, $wgUploadDirectory, $wgThumbnailScriptPath;
	global $wgSharedThumbnailScriptPath, $wgEnableParserCache;

	if ( is_null( $depth ) ) {
		$wgMakeDumpLinks = $this->depth;
	} else {
		$wgMakeDumpLinks = $depth;
	}

	# $wgScriptPath is not declared global here: it is only a local base for
	# the paths below and sits one level above the article path
	$wgScriptPath = '..' . str_repeat( '/..', $wgMakeDumpLinks );
	$wgArticlePath = str_repeat( '../', $wgMakeDumpLinks ) . '$1';
	$wgStylePath = "$wgScriptPath/skins";
	$wgUploadPath = "$wgScriptPath/images";
	$wgSharedUploadPath = "$wgUploadPath/shared";
	$wgLogo = "$wgStylePath/common/images/wiki.png";
	$wgMaxCredits = -1;
	# The name must match the one in the global declaration above; a
	# misspelled name would only assign an unused local variable
	$wgHideInterlanguageLinks = !$this->interwiki;
	$wgThumbnailScriptPath = $wgSharedThumbnailScriptPath = false;
	$wgEnableParserCache = false;

	$wgUser = new User;
	$wgUser->setOption( 'skin', 'htmldump' );
	$wgUser->setOption( 'editsection', 0 );

	$this->sharedStaticPath = "$wgUploadDirectory/shared";
}
/**
 * Reads the content of a title object, executes the skin and captures the result.
 *
 * Replaces the globals $wgOut, $wgTitle and (for non-special pages)
 * $wgArticle as a side effect.
 *
 * @param Title $title Title to render
 * @return string|false Output of the skin, or false if $title is null
 */
function getArticleHTML( &$title ) {
	global $wgOut, $wgTitle, $wgArticle, $wgUser, $wgUseCategoryMagic;

	$wgOut = new OutputPage;
	$wgOut->setParserOptions( new ParserOptions );

	# Plain assignment on purpose: "$wgTitle =& $title" would only rebind the
	# local alias created by the global statement and leave the real global
	# $wgTitle unchanged for everything else that reads it
	$wgTitle = $title;
	if ( is_null( $wgTitle ) ) {
		return false;
	}

	$ns = $wgTitle->getNamespace();
	if ( $ns == NS_SPECIAL ) {
		SpecialPage::executePath( $wgTitle );
	} else {
		# Pick the page class matching the namespace
		if ( $ns == NS_IMAGE ) {
			$wgArticle = new ImagePage( $wgTitle );
		} elseif ( $wgUseCategoryMagic && $ns == NS_CATEGORY ) {
			$wgArticle = new CategoryPage( $wgTitle );
		} else {
			$wgArticle = new Article( $wgTitle );
		}
		$wgArticle->view();
	}

	# Run the skin with output buffering on and return what it printed
	$sk =& $wgUser->getSkin();
	ob_start();
	$sk->outputPage( $wgOut );
	$text = ob_get_contents();
	ob_end_clean();
	return $text;
}
/**
 * Returns image paths used in an XHTML document.
 *
 * The text is run through an XML parser whose element callbacks are
 * wfDumpStartTagHandler() and wfDumpEndTagHandler(); whatever ends up in the
 * global $wgDumpImages afterwards is returned.
 *
 * @param string $text Document to scan
 * @return array Content of $wgDumpImages after parsing
 */
function findImages( $text ) {
	global $wgOutputEncoding, $wgDumpImages;

	# Start from an empty result for every document
	$wgDumpImages = array();

	$xmlParser = xml_parser_create( $wgOutputEncoding );
	xml_set_element_handler( $xmlParser, 'wfDumpStartTagHandler', 'wfDumpEndTagHandler' );
	xml_parse( $xmlParser, $text );
	xml_parser_free( $xmlParser );

	return $wgDumpImages;
}
/**
 * Copy images (or create symlinks) from commons to a static directory.
 * This is necessary even if you intend to distribute all of commons, because
 * the directory contents is used to work out which image description pages
 * are needed.
 *
 * Only entries whose path starts with $wgSharedUploadPath are handled. For a
 * thumbnail (relative path starting with "thumb/") the file named by the
 * three path components following "thumb" is copied as well.
 *
 * @param array $images Array whose keys are image paths; values are ignored
 */
function copyImages( $images ) {
	global $wgSharedUploadPath;

	# Find shared uploads and copy them into the static directory
	$sharedPathLength = strlen( $wgSharedUploadPath );
	foreach ( $images as $image => $dummy ) {
		# Is it shared?
		if ( substr( $image, 0, $sharedPathLength ) != $wgSharedUploadPath ) {
			continue;
		}

		# Reconstruct the path relative to the shared directory
		$rel = substr( $image, $sharedPathLength + 1 ); // +1 for slash
		if ( $rel === false || $rel === '' ) {
			# Nothing after the shared path, so there is no file to copy
			continue;
		}
		$this->copySharedFile( $rel );

		if ( substr( $rel, 0, 6 ) == 'thumb/' ) {
			# That was a thumbnail
			# We will also copy the real image
			$parts = explode( '/', $rel );
			if ( count( $parts ) >= 4 ) {
				$this->copySharedFile( "{$parts[1]}/{$parts[2]}/{$parts[3]}" );
			}
		}
	}
}

/**
 * Make one file of the shared upload directory available in the static
 * directory, unless something already exists at the destination.
 *
 * @param string $rel Path relative to both $wgSharedUploadDirectory and
 *                    $this->sharedStaticPath
 */
function copySharedFile( $rel ) {
	global $wgSharedUploadDirectory;

	$sourceLoc = "$wgSharedUploadDirectory/$rel";
	$staticLoc = "{$this->sharedStaticPath}/$rel";
	if ( file_exists( $staticLoc ) ) {
		return;
	}

	wfMkdirParents( dirname( $staticLoc ), 0755 );
	if ( function_exists( 'symlink' ) ) {
		# symlink( target, link ): the new link is created at the static
		# location and points at the existing source file
		symlink( $sourceLoc, $staticLoc );
	} else {
		copy( $sourceLoc, $staticLoc );
	}
}
}
/**
 * XML parser callback for opening tags.
 *
 * Records the SRC attribute of every IMG element as a key of the global
 * $wgDumpImages (value true); all other elements are ignored.
 */
function wfDumpStartTagHandler( $parser, $name, $attribs ) {
	global $wgDumpImages;

	if ( $name != 'IMG' ) {
		return;
	}
	if ( !isset( $attribs['SRC'] ) ) {
		return;
	}

	$src = $attribs['SRC'];
	$wgDumpImages[$src] = true;
}
/**
 * XML parser callback for closing tags.
 *
 * Intentionally does nothing; it is registered together with
 * wfDumpStartTagHandler() by DumpHTML::findImages().
 */
function wfDumpEndTagHandler( $parser, $name ) {
}
# vim: syn=php
?>

View file

@ -7,13 +7,13 @@
/** */
$optionsWithArgs = array( 's', 'd' );
$optionsWithArgs = array( 's', 'd', 'e' );
require_once( "commandLine.inc" );
require_once( "dumpHTML.inc" );
error_reporting( E_ALL & (~E_NOTICE) );
define( 'CHUNK_SIZE', 50 );
if ( !empty( $options['s'] ) ) {
$start = $options['s'];
@ -21,13 +21,48 @@ if ( !empty( $options['s'] ) ) {
$start = 1;
}
// Last page_id to dump: taken from -e, otherwise the highest page_id in the
// page table
if ( empty( $options['e'] ) ) {
	$dbr =& wfGetDB( DB_SLAVE );
	$end = $dbr->selectField( 'page', 'max(page_id)', false );
} else {
	$end = $options['e'];
}

// Destination directory: taken from -d, otherwise 'static'
$dest = empty( $options['d'] ) ? 'static' : $options['d'];
dumpHTML( $dest, $start );
$d = new DumpHTML( $dest, true, 3 );

// Mode flags are optional, so test them with empty() instead of reading
// array keys that may not exist
if ( !empty( $options['special'] ) ) {
	$d->doSpecials();
} elseif ( !empty( $options['images'] ) ) {
	$d->doImageDescriptions();
} elseif ( !empty( $options['categories'] ) ) {
	$d->doCategories();
} elseif ( $end - $start > CHUNK_SIZE * 2 ) {
	// Split the problem into smaller chunks, run them in different PHP instances
	// This is a memory/resource leak workaround
	print("Creating static HTML dump. Starting from page_id $start of $end.\n");
	chdir( "maintenance" );

	// "<=" so that a chunk starting exactly at $end (a single remaining
	// page) is still dumped
	for ( $chunkStart = $start; $chunkStart <= $end; $chunkStart += CHUNK_SIZE ) {
		$chunkEnd = $chunkStart + CHUNK_SIZE - 1;
		if ( $chunkEnd > $end ) {
			$chunkEnd = $end;
		}
		// Hand the destination to the child as well, otherwise it falls back
		// to its default directory while the passes below write to $dest.
		// NOTE(review): assumes the child resolves a relative -d path the
		// same way as this process does - confirm against commandLine.inc.
		// All values are escaped/cast because they come from the command line.
		passthru(
			'php dumpHTML.php -d ' . escapeshellarg( $dest ) .
			' -s ' . intval( $chunkStart ) .
			' -e ' . intval( $chunkEnd )
		);
	}
	chdir( ".." );

	$d->doImageDescriptions();
	$d->doCategories();
	// doMainPage() takes no arguments; the destination is already known to $d
	$d->doMainPage();
} else {
	$d->doArticles( $start, $end );
}

exit();