Robson » Code » PHP » What Links Here Sorter

 
<?php
   /*
       What-Links-Here Sorter 1.0 by Robson for Wikipedia
 
       The purpose of this script is to separate all the links that
         point to a page by their namespace and organise them
         alphabetically. This helps with dab link repair.
 
       Run this script with an article specified, like this:
           http://yourhost/what_links_here_sorter.php?article=Example
 
       Optionally, you can show just specified namespaces, like this:
           http://yourhost/what_links_here_sorter.php?article=Example&show=main,image
       That will just show pages in the 'main' and 'image' namespaces.
       You can specify any number of namespaces, separated by commas.
 
       Example output, which shows 'Pink' for all namespaces:
           http://iceyboard.no-ip.org/includes/code/php/wlhs_pink.html
   */
 
   // namespaces the user asked to see; stays empty when no filter was given,
   // which the output loop further down treats as "show every namespace"
   $show_spaces = array();
   // only accept a plain, non-empty string (?show[]=x would arrive as an array)
   if (isset($_GET['show']) && is_string($_GET['show']) && $_GET['show'] !== '')
       // split on commas, then trim and lower-case each name, because the
       // filter is later compared against lower-cased namespace names
       $show_spaces = array_map('strtolower', array_map('trim', explode(',', $_GET['show'])));
 
   // get the page from wikipedia, this gets the first 5000 links to the page
   // if there's any more than that you should scream very loudly
   $request = "GET /w/index.php?title=Special:Whatlinkshere/" . $_GET['article'] . "&limit=5000 HTTP/1.1\r\n";
   $request .= "Host: en.wikipedia.org\r\n";
   $request .= "User-agent: Iceys_WhatLinksHere_Sorter /1.0\r\n";
   $request .= "Connection: Close\r\n\r\n";
 
   // open a connection to wikipedia
   $file = fsockopen('en.wikipedia.org', 80);
   // send the request data through, this tells wikipedia what we want
   fwrite($file, $request);
   // loop through the data sent back
   while (!feof($file))
       // store it in a variable
       $wlh .= fgets($file, 128);
   // close the connection when nothing is left
   fclose($file);
 
   // remove everything before the list
   $wlh = substr($wlh, strpos($wlh, '</a>).<ul>')+strlen('</a>).<ul>'), strlen($wlh));
   // remove everything after the list
   $wlh = substr($wlh, 0, strpos($wlh, 'View (')-7);
   // get rid of some tags that we don't need
   $wlh = str_replace(array('<li>', '</li>', '</a>'), '', $wlh);
   // don't need link tags, so remove those as well
   $wlh = eregi_replace('<a[^>]+>', '', $wlh);
 
   // output everything in monospace
   echo '<pre>';
 
   // split all the links into an array
   $wlh = explode("\n", $wlh);
   // the default place to link to
   $linkto = $_GET['article'];
   // loop through each article that links to here
   foreach ($wlh as $link)
   {
       // check if this is the end of a redirect page list
       if (trim(strip_tags($link)) == NULL)
           // if it, reset to the default article
           $linkto = $_GET['article'];
       else
       {
           // check if this is the start of a redirect page list
           if (strpos($link, '<ul>'))
           {
               // split out the redirect page and the firts item
               $split = explode(' (redirect page) <ul>', $link);
               // redirect page
               $linkto = $split[0];
               // first item
               $link = $split[1];
           }
           // store the item in an array with all the other articles that link to that article
           $links[$linkto][] = $link;
       }
   }
 
   // list of namespaces
   $spaces = array(
           'Main', 'Talk',
           'User', 'User_talk',
           'Wikipedia', 'Wikipedia_talk',
           'Image', 'Image_talk',
           'MediaWiki', 'MediaWiki_talk',
           'Template', 'Template_talk',
           'Help', 'Help_talk',
           'Category', 'Category_talk',
           'Portal', 'Portal_talk'
           );
 
   // sort alphabetically by linked page
   ksort($links);
 
   // loops through each redirect page
   foreach($links as $page => $linkto)
   {
       // clear previously saved stuff
       $headers = NULL;
       $html_links = NULL;
       $pages = NULL;
       // output a table
       echo '<table border=1 style="font-size:0.7em">';
       // show what this table is linking to
       echo '<caption style="text-align:left; font-size:1.6em; white-space:pre"><a href="http://en.wikipedia.org/wiki/' . str_replace(' ', '_', $page) . '">' . $page . '</a></caption>';
       // loop through each line
       foreach($linkto as $line)
       {
           // default to main
           $active_space = 'Main';
           // loop through each namespace
           foreach($spaces as $space)
               // check if this page is in this namespace
               if (str_replace(' ', '_', substr($line, 0, strlen($space)+1)) == $space . ':')
                   // if it is, set this as the active namespace
                   $active_space = $space;
           // add this page to it's namespace array
           $pages[$active_space][] = trim($line);
       }
       // loop through each space
       foreach($spaces as $space)
       {
           // check if there's any links in this space
           if (count($pages[$space]))
           {
               // if the user didn't specify places to show
               // or the current space was specified
               if (!$show_spaces || in_array(strtolower($space), $show_spaces))
               {
                   // if there are, make a header
                   $headers .= '<th>' . $space . ' (' . count($pages[$space]) . ')</th>';
                   // and make a cell to store the links
                   $html_links .= '<td valign=top nowrap>';
                   // sort all the links alphabetically
                   sort($pages[$space]);
                   // loop through each one
                   foreach ($pages[$space] as $link)
                       // output it as a link to it's page
                       $html_links .= '<a href="http://en.wikipedia.org/wiki/' . str_replace(' ', '_', $link) . '">' . $link . '</a><br/>';
                   // end the cell
                   $html_links .= '</td>';
               }
           }
       }
       // show the links to the user
       echo '<tr>' . $headers . '</tr><tr>' . $html_links . '</tr></table><br/>';
   }
?>
 
© 2004-18 robson | cc unless stated