Robson » Code Viewer
- Path: script/checker.php
- Lines: 594
- Size: 19.06kB
<?php
// The constant name has to be a quoted string: a bare VERSION is itself an
// undefined constant (a notice on old PHP versions, a fatal error on PHP 8).
define('VERSION', '0.1.0');

/*
About
-----
Wikipedia Article Checker by Icey
http://iceyboard.no-ip.org/projects/wac/
Released under the GNU General Public License
http://www.gnu.org/copyleft/gpl.html

Info
----
Checks Wikipedia articles for errors.
Released under the GNU-GPL
http://www.gnu.org/copyleft/gpl.html

The implementation of some sections of the script are based on
the javascript peer reviewer by AndyZ
http://en.wikipedia.org/wiki/User:AndyZ/peerreviewer.js

To-do
-----
* Lead length
* Standard abbreviations
* Check article name appears in first sentence and is bold
* Check for one-sentence paragraphs
* Check for first or second person
* Find where mismatched template and link brackets are
*/
?>
<html>
<head>
<title>Wikipedia Article Checker</title>
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
<body>
<div id="infobox">
<p><a href="http://iceyboard.no-ip.org/projects/wac/">Wikipedia Article Checker</a></p>
<ul>
<li>V<?php echo VERSION; ?></li>
<li>Written by <a href="http://en.wikipedia.org/wiki/User:Icey">Icey</a></li>
<li>Written for <a href="http://en.wikipedia.org">en.wikipedia.org</a></li>
</ul>
<p>Instructions</p>
<ol>
<li>Go to the edit page of the article you want to check.</li>
<li>Copy the article text.</li>
<li>Paste it into the box and click 'Check Article'.</li>
</ol>
</div>
<form method="post" action="checker.php">
<p><textarea name="page" id="page" rows="10" cols="50" style="width:70%"><?php
// Redisplay the submitted article. It is user input, so it is escaped before
// being written back into the page.
echo (isset($_POST['page']) && is_string($_POST['page'])) ? htmlspecialchars($_POST['page']) : '';
?></textarea><br/>
<input type="submit" value="Check Article" /></p>
</form>
<?php
// check if the user has submitted a page
if (isset($_POST['page']) && is_string($_POST['page']) && $_POST['page']) {
    /**
     * Builds an HTML anchor pointing at a page on the English Wikipedia.
     *
     * The title is inserted unchanged into both the URL and the link text;
     * no escaping is applied here.
     *
     * @param string $link page title
     * @return string the anchor markup
     */
    function wp_link($link)
    {
        return '<a href="http://en.wikipedia.org/wiki/' . $link . '">' . $link . '</a>';
    }

    // less code later
    $page = $_POST['page'];

    // include the fault descriptions
    include 'faults.php';

    // make sure the fault list exists before the checks append to it
    if (!isset($faults))
        $faults = array();

    // padding lets the fixed-width context patterns below match words that
    // sit close to the very start or end of the article
    $padded_page = str_repeat('.', 40) . $page . str_repeat('.', 40);

    //**
    //** check if the article has some images
    //**

    if (strpos($page, '[[Image:') === false)
        $faults[] = fault_text('no_images');

    //**
    //** check for accidentally added code
    //**

    $accidental_codes = array(
        "'''Bold text'''",
        "''Italic text''",
        '[[Link title]]',
        '[http://www.example.com link title]',
        '== Headline text ==',
        '[[Image:Example.jpg]]',
        '[[Media:Example.ogg]]',
        '<math>Insert formula here</math>',
        '<nowiki>Insert non-formatted text here</nowiki>',
        '~~~~',
        '#REDIRECT [[Insert text]]'
    );

    // loop through each accidental code
    foreach ($accidental_codes as $accidental_code) {
        // check if it's contained in the document (case-insensitively)
        if (stripos($page, $accidental_code) !== false) {
            // store a fault message if it was found; the code is escaped so
            // that entries such as <math> are shown instead of being parsed as tags
            $faults[] = 'The article contains "' . htmlspecialchars($accidental_code)
                . '", which is default text placed by the edit toolbar.';
        }
    }

    //**
    //** check if the document starts with a heading (is that wrong?!)
    //**

    if (substr(trim($page), 0, 2) == '==')
        $faults[] = fault_text('starts_with_heading');

    //**
    //** day and month link checking
    //**

    // list of days and months
    $days = array('Mon', 'Tues', 'Wednes', 'Thurs', 'Fri', 'Satur', 'Sun');
    $months = array('January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December');

    // check if the article contains linked days
    foreach ($days as $day) {
        if (stripos($page, '[[' . $day . 'day]]') !== false)
            $faults[] = '"' . $day . 'day" is linked. ' . fault_text('day_or_month_linked');
    }

    // check if the article contains linked months
    foreach ($months as $month) {
        if (stripos($page, '[[' . $month . ']]') !== false)
            $faults[] = '"' . $month . '" is linked. ' . fault_text('day_or_month_linked');
    }

    //**
    //** check for dates relative to the current day
    //**

    $relative_dates = array(
        'recently',
        'last week', 'this week', 'next week',
        'last month', 'this month', 'next month',
        'last year', 'this year', 'next year',
        'yesterday', 'today', 'tomorrow',
        'soon', 'modern', 'currently'
    );

    // capture each phrase together with 40 characters of context on either side
    preg_match_all('/.{40}(' . implode('|', $relative_dates) . ').{40}/im', $padded_page, $rel_dates);

    if (count($rel_dates[0])) {
        $output = '';
        foreach ($rel_dates[0] as $n => $context) {
            $phrase = $rel_dates[1][$n];
            // escape the article text first, then emphasise the matched phrase
            $output .= '<li>'
                . str_replace($phrase, '<em>' . $phrase . '</em>', '...' . htmlspecialchars(trim($context)) . '...')
                . '</li>';
        }
        $faults[] = fault_text('relative_dates') . '<ol>' . $output . '</ol>';
    }

    //**
    //** check if the article contains an infobox
    //**

    if (stripos($page, 'infobox') === false)
        $faults[] = fault_text('no_infobox');

    //**
    //** check if this looks like a biography
    //**

    // a births/deaths category marks the article as a biography...
    $bio = preg_match('/\[\[category:[ ]*([0-9]+) (births|deaths)\]\]/i', $page) === 1;

    // ...and so does one of the person-related infoboxes
    $bio_items = array(
        '{{infobox biography',
        '{{infobox philosopher',
        '{{infobox military person',
    );

    foreach ($bio_items as $bio_item) {
        if (stripos($page, $bio_item) !== false)
            $bio = true;
    }

    // is it a bio?
    if ($bio) {
        // check if it contains the persondata template
        if (stripos($page, '{{persondata') === false)
            $faults[] = fault_text('no_persondata');
    }

    //**
    //** check measurements are displayed correctly
    //**

    // units/weights/mass/area (entries are regular expression fragments)
    $units = array(
        'km', 'kilometer', 'kilometre',
        'mile',
        'yd', 'yard',
        'metre', 'meter',
        'ft', 'foot', 'feet',
        'inch', 'ins',
        'cm', 'centimeter', 'centimetre',
        'dm', 'decimal',
        'lb', 'pound',
        'ton', 'metric ton',
        'gram',
        'in<sup>(2|3)<\/sup>', '(cubic|squared|square) inch',
        'ft<sup>(2|3)<\/sup>', '(cubic|squared|square) (feet|foot)',
        'm<sup>(2|3)<\/sup>', '(cubic|squared|square) met',
        'km<sup>(2|3)<\/sup>', '(cubic|squared|square) kilomet',
        'cm<sup>(2|3)<\/sup>', '(cubic|squared|square) centimet',
        'mi<sup>(2|3)<\/sup>', '(cubic|squared|square) mile'
    );

    // link brackets are dropped so that "5 [[km]]" is still recognised
    $mp_page = str_replace(array('[[', ']]'), '', $padded_page);

    preg_match_all(
        '/(([0-9]+|[0-9]+[,.]+[0-9]+|[0-9]+[,.]+[0-9]+[,.]+[0-9]+)(| )('
            . implode('|', $units) . ')(er|re|)(s|))/im',
        $mp_page,
        $units_found
    );

    if (count($units_found[0])) {
        $output = '';
        // the pattern only lets digits, separators, unit names and
        // <sup>2</sup>/<sup>3</sup> through, so the matches are printed as-is
        foreach ($units_found[0] as $measurement)
            $output .= '<li>' . $measurement . '</li>';
        $faults[] = fault_text('space_before_unit') . '<ol>' . $output . '</ol>';
    }

    //**
    //** check for templates that could have a date added
    //**

    $templates = array(
        'wikify',
        'linkless',
        'cleanup'
    );

    foreach ($templates as $template) {
        if (stripos($page, '{{' . $template . '}}') !== false)
            $faults[] = 'This page contains the ' . $template
                . ' template. Consider using the dated equivalent: {{' . $template . '-date|Month Year}}';
    }

    //**
    //** check categories are in alphabetical order
    //**

    preg_match_all("/\[\[Category\:([^\]]+)\]\]/im", $page, $cats, PREG_SET_ORDER);

    if (count($cats)) {
        $list_cats = array();
        foreach ($cats as $cat)
            $list_cats[] = $cat[1];

        $list_cats_sort = $list_cats;
        sort($list_cats_sort);

        if ($list_cats_sort !== $list_cats) {
            // glue comes first in implode(); the category names are user input
            // and are escaped before going into the textarea
            $faults[] = 'The categories aren\'t in alphabetical order. They should be like this:<br/>'
                . '<textarea rows="8" cols="80">'
                . htmlspecialchars('[[Category:' . implode(']]' . "\n" . '[[Category:', $list_cats_sort) . ']]')
                . '</textarea>';
        }
    } else {
        $faults[] = fault_text('no_cats');
    }

    //**
    //** check long numbers are separated by commas
    //**

    // removed references, because they frequently contain numbers which shouldn't be converted
    $cleaned_page = preg_replace('/<ref[^>]*(\/>|>[^<]+<\/ref>)/im', '', $page);

    // links shouldn't be converted either
    $cleaned_page = preg_replace('/\[\[([^\]]+)\]\]/im', '', $cleaned_page);
    $cleaned_page = preg_replace('/\[([^\]]+)\]/im', '', $cleaned_page);

    // find all numbers in the article (integers and decimals)
    // The pattern used to begin with a dangling "{1}" quantifier, which PCRE
    // refuses to compile ("nothing to repeat"); the error was hidden by "@",
    // so this check never produced anything.
    preg_match_all('/[0-9,]{1,100}(\.[0-9,]{0,100}|)[0-9]/', $cleaned_page, $long_numbers, PREG_SET_ORDER);

    // check if any numbers were found (with PREG_SET_ORDER there is one entry per match)
    if (count($long_numbers)) {
        // wrongly formatted numbers, and those that are probably years
        $output_normal = '';
        $output_year = '';

        // loop through each number
        foreach ($long_numbers as $long_number) {
            // store the number; a comma in front of it belongs to the sentence
            // (the pattern ends in [0-9], so there is always a digit left)
            $original = ltrim($long_number[0], ',');

            // remove commas
            $long_number[0] = str_replace(',', '', $original);

            // split into the integer part and (if present) the decimal part
            $number_parts = explode('.', $long_number[0], 2);
            $grouped = number_format((float) $number_parts[0]);

            // check if this is a decimal
            if (isset($number_parts[1])) {
                // only the integer part is regrouped; the decimal digits are left as written
                // NOTE(review): the previous code also put commas between every three
                // decimal digits (3.141,59) - confirm against the Manual of Style.
                $expected = $grouped . '.' . $number_parts[1];

                // check if the original number was wrong
                if ($original !== $expected)
                    // if so, output the old number and what it should be
                    $output_normal .= '<li>' . $original . ' -> ' . $expected . '</li>';
            } elseif ($original !== $grouped) {
                // the number is different from the correctly formatted number
                if ($long_number[0] > 1900 && $long_number[0] <= 2100)
                    $output_year .= '<li>' . $original . ' -> ' . $grouped . '</li>';
                else
                    $output_normal .= '<li>' . $original . ' -> ' . $grouped . '</li>';
            }
        }

        // add a new fault and show the numbers which are wrong,
        // but only when there is something to report
        if ($output_normal !== '' || $output_year !== '') {
            $output = 'Long numbers should be separated with commas. Please convert the following numbers:<br/>';

            // check for normal numbers
            if ($output_normal !== '')
                $output .= '<ol>' . $output_normal . '</ol>';

            // check for years
            if ($output_year !== '')
                $output .= "<br/>The following numbers are probably years, they shouldn't be converted.<br/><ol>"
                    . $output_year . '</ol>';

            $faults[] = $output;
        }
    }

    //**
    //** check large numbers dont have words like "about" before them
    //**

    // check for numbers with words like 'about' before them
    preg_match_all('/(about|approx|approximately) [0-9]{6,100}/im', $page, $about_numbers, PREG_SET_ORDER);

    // any found? (one entry per match with PREG_SET_ORDER)
    if (count($about_numbers)) {
        // clear the output
        $output = '';

        // loop through each case found
        foreach ($about_numbers as $about_number)
            // output it to a list item
            $output .= '<li>' . $about_number[0] . '</li>';

        // add a new fault with description and list of erroneous numbers
        $faults[] = "Large numbers don't need 'about' or 'approximately' before them. The following break this rule:<br/>"
            . '<ol>' . $output . '</ol>';
    }

    //**
    //** check references are after punctuation
    //**

    // flags a space in front of <ref, and punctuation placed after </ref>
    // (letters and digits are now a real character class; they used to be
    // the literal text "a-zA-Z0-9")
    preg_match_all(
        '/.{20}(([\]\'.,"a-zA-Z0-9]) <ref|<\/ref>( |)(\.|,)).{20}/im',
        str_repeat('.', 20) . $page . str_repeat('.', 20),
        $bad_refs
    );

    if (count($bad_refs[0])) {
        $output = '<ol>';
        foreach ($bad_refs[0] as $bad_ref)
            $output .= '<li>' . htmlspecialchars($bad_ref) . '</li>';
        $output .= '</ol>';
        $faults[] = fault_text('ref_placement') . $output;
    }

    //**
    //** check page length
    //**

    $page_kilobytes = strlen($page) / 1024;
    if ($page_kilobytes >= 30)
        $faults[] = fault_text('long_page', round($page_kilobytes, 1));

    //**
    //** check for weasel words
    //**

    $weasel_words = array(
        'some people say', 'some people said',
        'it has been',
        'many people believe', 'many scientists believe',
        'allege',
        'many people say', 'many people said',
        'arguably',
        'it is claimed',
        'correctly',
        'apparently',
        'people considered', 'many considered', 'is considered', 'are considered',
        'some argue',
        'contrary to many',
        'as opposed to most',
        'research has shown',
        'is widely regarded as', 'is widely considered to be',
        'it is believed that',
        'it has been said', 'it has been suggested', 'it has been noticed',
        'it has been decided', 'it has been stated',
        'they say that',
        'could it be that', 'it may be that',
        'critics say that', 'experts say that',
        'some historians argue',
        'considered by many',
        'observers say',
        'fans say',
        'accusations',
        'obviously',
        'serious scholars', 'serious scientists', 'serious researchers',
        'mainstream scholars', 'mainstream scientists', 'mainstream researchers',
        'science says', 'medicine says',
        'experts suggest'
    );

    // capture each phrase together with 40 characters of context on either side
    preg_match_all('/.{40}(' . implode('|', $weasel_words) . ').{40}/im', $padded_page, $weasel_words_found);

    if (count($weasel_words_found[0])) {
        $output = '';
        foreach ($weasel_words_found[0] as $n => $context) {
            $phrase = $weasel_words_found[1][$n];
            // the context is article text supplied by the user: escape it
            // before the matched phrase is wrapped in <em>
            $output .= '<li>'
                . str_replace($phrase, '<em>' . $phrase . '</em>', '...' . htmlspecialchars(trim($context)) . '...')
                . '</li>';
        }
        $faults[] = fault_text('weasel_words') . '<ol>' . $output . '</ol>';
    }

    //**
    //** check for peacock terms
    //**

    // the value of this assignment follows on the next source line
    $peacock_terms =