Jump to content

User:SQLBot/Readref.php

From Wikipedia, the free encyclopedia
<?php
// Usage text. Defined up front so that every argument check below can print it
// (previously it only existed inside the first check).
$helptext = "Reference Problem Finder, by SQL@Enwiki\nphp $argv[0] <DumpFile> <OutputFile> <domain> <options>\n* -w = Wikify output\n* -d = Double check against API\n\n";

// The three positional arguments (dump file, output file, domain) are required.
if( !isset( $argv[1], $argv[2], $argv[3] ) ) {
	die($helptext);
}

// Host name of the wiki; read by GetPage() through `global $domain`.
$domain = $argv[3];

// Input dump (read line by line) and output list of problem titles.
$fIn = fopen( $argv[1], "r" );
if( $fIn === FALSE ) {
	die( "Unable to open dump file for reading: $argv[1]\n" );
}
$fOut = fopen( $argv[2], "w" );
if( $fOut === FALSE ) {
	fclose( $fIn );
	die( "Unable to open output file for writing: $argv[2]\n" );
}

// Option flags. Always assigned a boolean so later code never reads an
// undefined variable when the switch is absent.
$wikify = in_array( "-w", $argv, TRUE );      // -w: write "* [[Title]]" lines
$doublecheck = in_array( "-d", $argv, TRUE ); // -d: re-check hits against the API

// Pattern for an opening <ref> tag (with or without attributes).
$refs = "/(<ref |<ref>)/i";
// Pattern for anything that renders the reference list: the listed templates
// or a <references> tag.
$reflist = "/(\{\{(reflist|reference|refs|footnotes)|<references)/i";

/**
 * Fetch the wikitext of a page through the MediaWiki API of the wiki named by
 * the global $domain.
 *
 * @param string $article Page title; it is URL-encoded before being sent.
 * @return string Content of the first revision returned by the API, or an
 *                empty string when the request fails, the response cannot be
 *                decoded, or the page has no revision content (e.g. missing).
 */
function GetPage($article) {
	global $domain;
	$url = "http://$domain/w/";
	$request = $url . 'api.php?action=query&prop=revisions&titles=' . urlencode($article) . '&rvprop=content&format=php';

	$sxGetArticle = file_get_contents($request);
	if( $sxGetArticle === FALSE ) {
		// Network / HTTP failure: report "no text" instead of decoding garbage.
		return( '' );
	}

	// NOTE(review): unserialize() on data received over the network is unsafe
	// if the remote host is not trusted (object injection). Consider
	// requesting format=json and using json_decode() instead.
	$sxGetA = unserialize($sxGetArticle);
	if( !is_array( $sxGetA ) || !isset( $sxGetA['query']['pages'] ) || !is_array( $sxGetA['query']['pages'] ) ) {
		return( '' );
	}

	// Only one title was requested, so the first entry of the pages map is the
	// page we want, whatever its key is.
	$page = reset( $sxGetA['query']['pages'] );
	if( !isset( $page['revisions'][0]['*'] ) ) {
		return( '' );
	}
	return( $page['revisions'][0]['*'] );
}

/**
 * Decide whether an article uses <ref> tags without anything that displays
 * them.
 *
 * The patterns come from the globals $refs (opening <ref> tag) and $reflist
 * (reference-list templates or a <references> tag).
 *
 * @param string $text Article wikitext; HTML entities are decoded first, so
 *                     entity-encoded text taken from an XML dump is accepted.
 * @return bool TRUE when the text contains a <ref> but no reference list;
 *              FALSE otherwise, including for anything containing "#REDIRECT".
 */
function checkArticle( $text ) {
	global $refs, $reflist;
	$text = html_entity_decode( (string) $text );

	// Remove HTML comments so commented-out tags/templates are not counted.
	// The match is lazy and uses the "s" modifier so that each comment is
	// removed on its own (not everything between the first "<!--" and the last
	// "-->") and comments spanning several lines are removed too.
	if( stripos( $text, "<!--") !== FALSE ) {
		$stripped = preg_replace( "/<!--.*?-->/s", "", $text );
		if( $stripped !== NULL ) {
			// NULL means a PCRE error; keep the unstripped text in that case.
			$text = $stripped;
		}
	}

	if( stripos( $text, "#REDIRECT" ) !== FALSE ) {
		return( FALSE );
	}

	$hasRef = preg_match( $refs, $text );
	$hasRefList = preg_match( $reflist, $text );

	// Explicit boolean result: a problem is "has refs, has no list".
	return( $hasRef === 1 && $hasRefList !== 1 );
}
// Running totals for the dump scan: pages checked, clean pages, problem pages.
$num = 0;
$ok = 0;
$prob = 0;
// Most recent <title> seen. Initialised so a <text> element that shows up
// before any <title> cannot read an undefined variable.
$title = '';
$time_start = microtime(true);

// Scan the dump line by line: remember each <title>, collect the matching
// <text> element (which may span many lines) and run checkArticle() on it.
while( !feof( $fIn ) ) {
	$fLine = fgets( $fIn );
	if( $fLine === FALSE ) {
		// End of file or read error: nothing more to scan.
		break;
	}
	$fLine = trim( $fLine );

	if( preg_match( "/\<title\>(.*)\<\/title\>/i", $fLine, $mTitle ) ) {
		echo "$num [$ok / $prob]: Checking $mTitle[1]... ";
		$title = $mTitle[1];
	}

	if( strpos( $fLine, "<text" ) === FALSE ) {
		continue;
	}

	$aText = $fLine;
	$multiLine = FALSE;
	// A self-closing <text ... /> element has no closing tag to wait for;
	// reading on would swallow the following pages.
	$selfClosed = ( preg_match( '/<text\b[^>]*\/>/i', $fLine ) === 1 );
	if( !$selfClosed && strpos( $fLine, "</text>" ) === FALSE ) {
		// The element continues on later lines: append (trimmed) lines until
		// the closing tag. The comparison is strict because strpos() returns 0
		// when a line starts with "</text>", and the EOF guard stops a
		// truncated dump from looping forever.
		$multiLine = TRUE;
		$mEndFound = FALSE;
		while( $mEndFound === FALSE && !feof( $fIn ) ) {
			$fLine = fgets( $fIn );
			if( $fLine === FALSE ) {
				break;
			}
			$fLine = trim( $fLine );
			$aText = $aText . $fLine;
			$mEndFound = strpos( $fLine, "</text>" );
		}
		echo " End Found... ";
	}

	// Accept any attributes on the opening tag, not only xml:space="preserve".
	// An element without a body (self-closing or truncated) counts as empty.
	if( preg_match( '/<text\b[^>]*>(.*)<\/text>/is', $aText, $mText ) ) {
		$body = $mText[1];
	} else {
		$body = '';
	}
	if( $multiLine ) {
		echo " Got Text... ";
	} else {
		echo " Got text...";
	}

	if( !checkArticle( $body ) ) {
		echo " No problems!\n";
		$ok++;
	} else {
		echo " Problem!\n";
		// Same output format whether the text was on one line or many.
		if( !empty( $wikify ) ) {
			fwrite( $fOut, "* [[$title]]\n" );
		} else {
			fwrite( $fOut, "$title\n" );
		}
		$prob++;
	}
	$num++;
}
// Statistics of the first pass (no double-checking), kept for the final
// report. Keys are quoted strings: bare words such as $nodc[time] are
// undefined constants, which is a fatal error on PHP 8.
$time_end = microtime(true);
$time = round($time_end - $time_start, 0);
$nodc = array();
$nodc['time'] = $time;
// Elapsed time is rounded to whole seconds and can be 0 for small dumps;
// in that case everything ran within a second, so report $num per second.
$nodc['rps'] = ( $time > 0 ) ? $num / $time : $num;
$nodc['num'] = $num;
$nodc['prob'] = $prob;
$nodc['ok'] = $ok;
fclose( $fOut );

// Optional second pass (-d): re-read the list just written, fetch every
// listed page through GetPage() and keep only those that still fail.
if( !empty( $doublecheck ) ) {
	echo "\n\nDouble-checking articles!\n\n";
	$articles = file( $argv[2] );
	if( $articles === FALSE ) {
		$articles = array();
	}
	// Re-open for writing only after the list has been read: "w" truncates.
	$fOut = fopen( $argv[2], "w" );
	if( $fOut === FALSE ) {
		die( "Unable to open output file for writing: $argv[2]\n" );
	}
	sort( $articles );
	$num = 0;
	$ok = 0;
	$prob = 0;
	$time_start = microtime(true);
	foreach( $articles as $article) {
		$article = trim( $article );
		if( $article === '' ) {
			// Blank line: there is no title to look up.
			continue;
		}
		$num++;
		// Lines written with -w look like "* [[Title]]"; recover the bare title.
		if( strpos( $article, "* [[" ) !== FALSE && preg_match( "/\* \[\[(.*)\]\]/i", $article, $mArticle ) ) {
			$article = $mArticle[1];
		}
		echo "$num [$ok / $prob]: $article :";
		$aText = GetPage($article);
		if( !checkArticle( $aText ) ) {
			echo " No problems!\n";
			$ok++;
		} else {
			echo " Problem!\n";
			if( !empty( $wikify ) ) {
				fwrite( $fOut, "* [[$article]]\n" );
			} else {
				fwrite( $fOut, "$article\n" );
			}
			$prob++;
		}
	}
	fclose( $fOut );
	$time_end = microtime(true);
	$time = round($time_end - $time_start, 0);
	$rps = ( $time > 0 ) ? $num / $time : $num;
	// Number (and share) of first-pass hits the live check eliminated.
	$elim = $nodc['prob'] - $prob;
	$dcpct = ( $nodc['prob'] > 0 ) ? $elim / $nodc['prob'] : 0;
	$dcpct = round( $dcpct * 100, 0 );
	echo "Processed $num in $time (sec) at about $rps checks per second, with double-checking enabled. DC eliminated $elim ($dcpct%) positives.\n";
}
echo "Processed $nodc[num] in $nodc[time] (sec) at about $nodc[rps] checks per second, with no double-checking.\nRun complete\n";
fclose( $fIn );

?>