User:Dr pda/generatestats.js

From Wikipedia, the free encyclopedia
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
 //<pre>
 //This script generates a list of the ten shortest and ten longest articles which transclude a template,
 //e.g. {{featured article}}, calculates some statistics and plots a histogram.
 //To use this function add {{subst:js|User:Dr pda/generatestats.js}} to your monobook.js
 //then go to http://en.wikipedia.org/w/index.php?title=User:Dr_pda/generatestats&action=edit
 //See the talk page for documentation.
 
 /**
  * Constructor for a minimal (key, value) record; call it with `new`.
  * Elsewhere in this script the key is a page title and the value its size.
  *
  * @param key   stored on the new object as `key`
  * @param value stored on the new object as `value`
  */
 function keyValuePair(key,value){
   var pair = this;
   pair.key = key;
   pair.value = value;
 }
 
 
 /**
  * Comparator for Array.prototype.sort that orders records ascending by
  * their `value` property. The subtraction coerces both operands to numbers,
  * so numeric strings are compared numerically rather than lexically.
  *
  * @returns negative, zero or positive as a.value is below, equal to or above b.value
  */
 function sortByValue(a, b){
   var difference = a.value - b.value;
   return difference;
 }
 
 /**
  * Chooses an axis increment for the range [min, max].
  *
  * The range is divided by 15 and the smallest entry of a fixed table of
  * "round" increments that is at least that large is returned; if the range
  * is too wide for every table entry, the largest entry (5000) is returned.
  *
  * @param min lower end of the range
  * @param max upper end of the range
  * @returns one of the increments from the table below
  */
 function getBestScale(min,max){
   // Declared locally: the original assigned to an undeclared name, which
   // leaked a global `scales` and throws under strict mode.
   var scales = [0.2,0.5,1,2,5,10,20,25,50,100,200,250,500,1000,2000,5000];
   var val = (max-min)/15;
   // Indexed loop instead of for...in, so properties added to
   // Array.prototype by other scripts are never visited.
   for(var i=0;i<scales.length;i++){
     if (scales[i] >= val) return scales[i];
   }
   return 5000;
 }
 
 /**
  * Starts an asynchronous GET request for `url` and calls
  * `handler(request, template)` on every readystatechange event of that
  * request. Does nothing when the browser offers neither XMLHttpRequest nor
  * ActiveXObject.
  *
  * @param url      address to fetch
  * @param handler  callback invoked as handler(request, template)
  * @param template value passed through unchanged as the handler's second argument
  */
 function loadXMLDocPassingTemplate(url,handler,template)
 {
   var request;

   if (window.XMLHttpRequest) {
     // native XMLHttpRequest object
     request = new XMLHttpRequest();
   } else if (window.ActiveXObject) {
     // IE/Windows ActiveX version
     request = new ActiveXObject("Microsoft.XMLHTTP");
   }

   if (!request) {
     return;
   }

   request.onreadystatechange = function () {
     handler(request, template);
   };
   request.open("GET", url, true);
   request.send("");
 }
 
 /**
  * XMLHttpRequest state-change handler for the page-size API queries.
  *
  * Once the request has loaded with status 200 it:
  *  - appends one keyValuePair (title attribute, length attribute) per <page>
  *    element to the global pagesList, advancing the global index;
  *  - if the response contains an <embeddedin> element, requests the next
  *    batch using its geicontinue attribute;
  *  - otherwise, when no size queries remain outstanding (jobsLeft is 0),
  *    starts the final step: sortAndMakeChart() directly, or - if the page
  *    URL contains 'prosesize' - one action=render request per collected
  *    page, handled by getProseSizeFromPage.
  *
  * Any other status is reported with alert().
  *
  * Reads/writes the globals pagesList, index, jobsLeft; reads queryURL,
  * useTalkCategory and useTemplateCategory.
  *
  * @param req      the XMLHttpRequest whose state changed
  * @param template value handed on unchanged to any follow-up size request
  */
 function getSizeFromAPI(req,template) {
   // only if req shows "loaded"
   if (req.readyState != 4) return;

   // only if "OK"
   if (req.status != 200) {
     alert("There was a problem retrieving the XML data:\n" +
         req.statusText);
     return;
   }

   // In the category modes several size queries run in parallel and
   // jobsLeft counts the ones still outstanding.
   var countingJobs = useTalkCategory || useTemplateCategory;
   if (countingJobs) jobsLeft--;

   var response = req.responseXML.documentElement;
   var pages = response.getElementsByTagName('page');

   for (var i = 0; i < pages.length; i++) {
     pagesList[index++] = new keyValuePair(pages[i].getAttribute('title'), pages[i].getAttribute('length'));
   }
   if (pages.length > 0) {
     document.getElementById('wpTextbox1').value = 'Retrieved ' + index + ' articles.\n To abort click the back button in your browser.';
   }

   //Check for more pages
   var embeddedin = response.getElementsByTagName('embeddedin');
   if (embeddedin.length > 0) {
     var geicontinue = embeddedin[0].getAttribute('geicontinue');
     if (countingJobs) jobsLeft++;
     // The token is escaped so that characters such as '&', '+' or '#'
     // inside it cannot corrupt the query string.
     loadXMLDocPassingTemplate(queryURL + '&geicontinue=' + encodeURIComponent(geicontinue), getSizeFromAPI, template);
     return;
   }

   // Deliberately checked even when this response held no pages: previously
   // a final response without pages left the run waiting forever.
   if (jobsLeft != 0) return;

   if (index == 0) {
     document.getElementById('wpTextbox1').value = 'No articles found.';
     return;
   }

   //If using wiki text size
   if (document.location.href.indexOf('prosesize') == -1) {
     sortAndMakeChart();
     return;
   }

   //If using readable prose size (WARNING:Will load every page which transcludes template. Could be thousands of pages!!)
   for (var p = 0; p < pagesList.length; p++) {
     var pageTitle = pagesList[p].key;
     var titleURL = encodeURIComponent(pageTitle.replace(/ /g,'_'));
     loadXMLDocPassingTemplate('/w/index.php?action=render&title=' + titleURL, getProseSizeFromPage, pageTitle);
   }
 }
 
 /**
  * XMLHttpRequest state-change handler for the talk-page category query.
  *
  * Once the request has loaded with status 200 it collects the `subjectid`
  * attribute of every <page> element into the global articleList. If the
  * response contains a <categorymembers> element, the next batch is
  * requested using its gcmcontinue attribute. Otherwise the collected ids
  * are sent to getSizeFromAPI in groups of at most 50, incrementing the
  * global jobsLeft once per request issued.
  *
  * Any other status is reported with alert().
  *
  * @param req      the XMLHttpRequest whose state changed
  * @param template value handed on unchanged to every follow-up request
  */
 function getArticlePageFromTalkPage(req,template) {
   // only if req shows "loaded"
   if (req.readyState != 4) return;

   // only if "OK"
   if (req.status != 200) {
     alert("There was a problem retrieving the XML data:\n" +
         req.statusText);
     return;
   }

   var response = req.responseXML.documentElement;
   var pages = response.getElementsByTagName('page');

   for (var i = 0; i < pages.length; i++) {
     var subjectId = pages[i].getAttribute('subjectid');
     // A <page> without the attribute yields null; skipping it keeps the
     // literal text "null" out of the pageids list built below.
     if (subjectId) articleList.push(subjectId);
   }

   var categorymembers = response.getElementsByTagName('categorymembers');
   if (categorymembers.length > 0) {
     var gcmcontinue = categorymembers[0].getAttribute('gcmcontinue');
     // Escaped so that '&', '+' or '#' inside the token cannot corrupt the query string.
     loadXMLDocPassingTemplate(talkQueryURL + '&gcmcontinue=' + encodeURIComponent(gcmcontinue), getArticlePageFromTalkPage, template);
     return;
   }

   //All pages retrieved
   if (articleList.length == 0) {
     document.getElementById('wpTextbox1').value = 'No articles found.';
     return;
   }

   //API limited to 50 titles per query
   var batchSize = 50;
   for (var start = 0; start < articleList.length; start += batchSize) {
     var pageIds = articleList.slice(start, start + batchSize).join('|');
     jobsLeft++;
     loadXMLDocPassingTemplate(queryURL + pageIds, getSizeFromAPI, template);
   }
 }
 
 /**
  * XMLHttpRequest state-change handler for the template-category query.
  *
  * Once the request has loaded with status 200 it appends the URL-escaped
  * `title` attribute of every <page> element to the global articleList. If
  * the response contains a <categorymembers> element, the next batch is
  * requested using its gcmcontinue attribute. Otherwise one size query per
  * collected title is started (handled by getSizeFromAPI), incrementing the
  * global jobsLeft once per request.
  *
  * Any other status is reported with alert().
  *
  * @param req      the XMLHttpRequest whose state changed
  * @param template value handed on unchanged to every follow-up request
  */
 function getPagesFromTemplateCategory(req,template) {
   // only if req shows "loaded"
   if (req.readyState != 4) return;

   // only if "OK"
   if (req.status != 200) {
     alert("There was a problem retrieving the XML data:\n" +
         req.statusText);
     return;
   }

   var response = req.responseXML.documentElement;
   var pages = response.getElementsByTagName('page');

   for (var i = 0; i < pages.length; i++) {
     articleList.push(encodeURIComponent(pages[i].getAttribute('title')));
   }

   var categorymembers = response.getElementsByTagName('categorymembers');
   if (categorymembers.length > 0) {
     var gcmcontinue = categorymembers[0].getAttribute('gcmcontinue');
     // Escaped so that '&', '+' or '#' inside the token cannot corrupt the query string.
     loadXMLDocPassingTemplate(templateQueryURL + '&gcmcontinue=' + encodeURIComponent(gcmcontinue), getPagesFromTemplateCategory, template);
     return;
   }

   //All pages retrieved
   if (articleList.length == 0) {
     document.getElementById('wpTextbox1').value = 'No templates found.';
     return;
   }

   //API embeddedin query can only take one title
   for (var t = 0; t < articleList.length; t++) {
     jobsLeft++;
     loadXMLDocPassingTemplate(queryURL + articleList[t], getSizeFromAPI, template);
   }
 }
 
 /**
  * XMLHttpRequest state-change handler for a rendered-page request; measures
  * the page's prose size.
  *
  * Once the request has loaded with status 200 it sums the text length of
  * every span between a literal '<p>' and the following '</p>' in the
  * response, after removing bracketed numbers of one to three digits (e.g.
  * "[12]"), the text "citation needed" and all HTML tags. The result is
  * appended to the global proseList as keyValuePair(title, size) and the
  * progress message is updated. When proseIndex reaches index, pagesList is
  * replaced by proseList and sortAndMakeChart() is called.
  *
  * Any other status is reported with alert().
  *
  * @param req   the XMLHttpRequest whose state changed
  * @param title title recorded as the key of the new proseList entry
  */
 function getProseSizeFromPage(req,title) {
   // only if req shows "loaded"
   if (req.readyState != 4) return;

   // only if "OK"
   if (req.status != 200) {
     alert("There was a problem retrieving the XML data:\n" +
         req.statusText);
     return;
   }

   var response = req.responseText;
   var proseSize = 0;

   var start = response.indexOf('<p>');
   while (start > -1) {
     var stop = response.indexOf('</p>', start);
     // An unterminated paragraph runs to the end of the text. Without this
     // guard stop would be -1 and the next search would restart from the
     // beginning of the response, looping forever.
     if (stop == -1) stop = response.length;

     var para = response.substring(start + 3, stop);
     para = para.replace(/\[\d{1,3}\]/g,'');
     para = para.replace(/citation needed/g,'');
     para = para.replace(/(<([^>]+)>)/ig,'');
     proseSize += para.length;

     start = response.indexOf('<p>', stop);
   }

   proseList[proseIndex++] = new keyValuePair(title, proseSize);
   document.getElementById('wpTextbox1').value = 'Retrieved prose size for ' + proseIndex + ' out of ' + index + ' articles.\n To abort click the back button in your browser.';

   //If last page retrieved then start processing
   if (proseIndex == index) {
     pagesList = proseList;
     sortAndMakeChart();
   }
 }
 
 
 /**
  * Sorts the global pagesList by size, builds the wikitext report and puts
  * it into the edit box, then clicks the preview button.
  *
  * The report consists of the (up to) ten longest and ten shortest pages,
  * the number of pages with mean and median size in kB, an EasyTimeline
  * <timeline> histogram of sizes, and - when the page URL contains '&list' -
  * the complete list ordered from longest to shortest.
  *
  * Side effects: pagesList is left sorted in descending order; the value of
  * element 'wpTextbox1' is overwritten; element 'wpPreview' is clicked. If
  * pagesList is empty, only a short message is written and no preview is
  * triggered.
  */
 function sortAndMakeChart(){
   var textbox = document.getElementById('wpTextbox1');
   var total = pagesList.length;

   // Nothing to report on; every step below needs at least one entry.
   if (total == 0) {
     textbox.value = 'No articles found.';
     return;
   }

   var showList = document.location.href.indexOf('&list') != -1;
   var i;

   // Formats one numbered wikitext list line for a page record.
   function listLine(page){
     return '# [[' + page.key + ']] (' + Math.round(page.value/1024) + ' kB)\n';
   }

   pagesList.sort(sortByValue);

   //Get top ten and bottom ten (fewer when there are fewer than ten pages)
   var shown = Math.min(10, total);
   var bottomTen = '===Ten shortest articles===\n';
   for (i = 0; i < shown; i++) {
     bottomTen += listLine(pagesList[i]);
   }

   pagesList.reverse();
   var topTen = '===Ten longest articles===\n';
   for (i = 0; i < shown; i++) {
     topTen += listLine(pagesList[i]);
   }

   var list = '===List of articles by size===\n';
   if (showList) {
     for (i = 0; i < total; i++) {
       list += listLine(pagesList[i]);
     }
   }

   //Get Range (list is now in descending order)
   var max = Math.ceil(pagesList[0].value/1024);
   var min = Math.floor(pagesList[total-1].value/1024);
   var xScale = getBestScale(min,max);
   max = Math.ceil(max/xScale)*xScale;
   min = Math.floor(min/xScale)*xScale;
   // Rounded because fractional scales can leave float noise in the
   // quotient, and at least 1 so a zero-width range still gets a bar.
   var numBins = Math.max(1, Math.round((max - min)/xScale));

   // Label of the i-th histogram bar, rounded to strip float noise such as
   // 1.6000000000000001 when the scale is fractional.
   function barLabel(binIndex){
     return Number((min + binIndex*xScale).toFixed(2));
   }

   //Calculate statistics
   var sum = 0.0;
   var bins = [];
   for (i = 0; i < numBins; i++) {
     bins.push(0);
   }
   for (i = 0; i < total; i++) {
     var size = Number(pagesList[i].value);
     sum += size;
     var bin = Math.floor((size/1024 - min)/xScale);
     // A page sitting exactly on the upper edge would index one past the
     // last bin; clamp it into the histogram instead of losing it.
     bin = Math.min(Math.max(bin, 0), numBins - 1);
     bins[bin]++;
   }

   var mean = (sum/(total*1024)).toFixed(3);

   // Middle element for an odd count, mean of the two middle elements for
   // an even count.
   var mid = Math.floor(total/2);
   var medianBytes;
   if (total % 2 == 1) {
     medianBytes = Number(pagesList[mid].value);
   } else {
     medianBytes = (Number(pagesList[mid-1].value) + Number(pagesList[mid].value))/2;
   }
   var median = (medianBytes/1024).toFixed(3);

   var statistics = '===Statistics===\n*Number of articles: '+total+'\n*Mean: '+mean+' kB\n*Median: '+median+' kB\n';

   //Calculate best vertical scale
   var yMax = Math.max.apply(Math,bins);
   var yScale = getBestScale(0,yMax);
   yScale = Math.max(1,yScale);
   yMax = Math.ceil(yMax/yScale)*yScale;
   var verticalScale = '\nScaleMajor = gridcolor:darkgrey increment:' + yScale + ' start:0';
   // Minor grid lines only when half the major increment is a whole number.
   if (Math.floor(yScale/2) == yScale/2) {
     verticalScale += '\nScaleMinor = gridcolor:lightgrey increment:' + yScale/2 + ' start:0';
   }

   //Draw chart
   var chart = '===Chart===\n<timeline>\nColors=\n  id:lightgrey  value:gray(0.8)\n  id:darkgrey  value:gray(0.8)\n  id:white value:rgb(1,1,1)\n  id:steel value:rgb(0.6,0.7,0.8)\n\nImageSize  = width:auto height:303 barincrement:25\nPlotArea   = left:50 bottom:50 top:30 right:30\nDateFormat = x.y\nPeriod     = from:0 till:' + yMax +'\nTimeAxis   = orientation:vertical\nAlignBars  = early'+ verticalScale +'\nBackgroundColors = canvas:white\n\nPlotData=\n  color:steel width:20 align:left\n';
   for (i = 0; i < numBins; i++) {
     chart += '  bar:'+barLabel(i)+' from:0 till:'+bins[i]+'\n';
   }
   //Add axis label
   chart += '  bar:'+barLabel(Math.floor(2*numBins/5))+' at:0 text:"Article size in kB" shift:(0,-30)\n\n</timeline>';

   var report = topTen + '\n' + bottomTen + '\n' + statistics + '\n' + chart;
   if (showList) {
     report += '\n' + list;
   }
   textbox.value = report;
   document.getElementById('wpPreview').click();
 }
 
 
 /**
  * Entry point: resets the script's shared global state, asks the user which
  * template or category to analyse and starts the first API request.
  *
  * The mode is chosen from flags in the page URL:
  *  - 'usetalkcategory'     : ask for a talk-page category; handled by
  *                            getArticlePageFromTalkPage
  *  - 'usetemplatecategory' : ask for a template category; handled by
  *                            getPagesFromTemplateCategory
  *  - otherwise             : ask for a template; handled by getSizeFromAPI
  *  - 'specifynamespace'    : additionally ask for the namespace number,
  *                            which is used only by the plain template query
  *
  * If the user cancels a prompt or enters nothing, no request is started and
  * the edit box is left untouched.
  */
 function generateStatistics(){
   var href = document.location.href;

   // Asks for a page name and returns prefix + name with the first letter
   // upper-cased, or null when the prompt was cancelled or left empty.
   function askForTitle(message, prefix){
     var answer = prompt(message,"");
     if (answer == null || answer === '') return null;
     return prefix + answer.toUpperCase().charAt(0) + answer.slice(1);
   }

   // State shared with the request handlers. Assigned through window so the
   // globals are created explicitly rather than by undeclared assignment.
   window.pagesList = [];
   window.index = 0;
   window.proseList = [];
   window.proseIndex = 0;
   window.articleList = [];
   window.template = '';
   window.queryURL = '';
   window.talkQueryURL = '';
   window.templateQueryURL = '';
   window.jobsLeft = 0;
   window.namespace = '0';

   window.useTalkCategory = href.indexOf('usetalkcategory') != -1;
   window.useTemplateCategory = href.indexOf('usetemplatecategory') != -1;
   window.specifyNamespace = href.indexOf('specifynamespace') != -1;

   if(window.specifyNamespace){
     var chosenNamespace = prompt("Enter the number of the namespace the pages are in\n (0=article, 2=User, 4=Wikipedia etc)","");
     if (chosenNamespace == null) return;
     window.namespace = chosenNamespace;
   }

   // Titles and the namespace are typed by the user, so they are escaped
   // before being placed in a query string ('&', '+', '#' and spaces would
   // otherwise corrupt the request).
   var title;
   if(window.useTalkCategory){
     title = askForTitle("Enter the talk page category you want to check for\n (Don't include Category:)", "Category:");
     if (title === null) return;
     window.template = title;
     window.talkQueryURL = '/w/api.php?action=query&generator=categorymembers&gcmtitle=' + encodeURIComponent(title) + '&gcmlimit=500&gcmnamespace=1&prop=info&inprop=subjectid&format=xml';
     window.queryURL = '/w/api.php?action=query&prop=info&format=xml&pageids=';
     loadXMLDocPassingTemplate(window.talkQueryURL,getArticlePageFromTalkPage,title);
   }
   else if(window.useTemplateCategory){
     title = askForTitle("Enter the template category you want to check\n (Don't include Category:)", "Category:");
     if (title === null) return;
     window.template = title;
     window.templateQueryURL = '/w/api.php?action=query&generator=categorymembers&gcmtitle=' + encodeURIComponent(title) + '&gcmlimit=500&gcmnamespace=10&prop=info&format=xml';
     window.queryURL = '/w/api.php?action=query&generator=embeddedin&geilimit=500&geinamespace=0&prop=info&format=xml&geititle=';
     loadXMLDocPassingTemplate(window.templateQueryURL,getPagesFromTemplateCategory,title);
   }
   else{
     title = askForTitle("Enter the template you want to check for\n (Don't include Template:)", "Template:");
     if (title === null) return;
     window.template = title;
     window.queryURL = '/w/api.php?action=query&generator=embeddedin&geititle=' + encodeURIComponent(title) + '&geilimit=500&geinamespace=' + encodeURIComponent(window.namespace) + '&prop=info&format=xml';
     loadXMLDocPassingTemplate(window.queryURL,getSizeFromAPI,title);
   }
   document.getElementById('wpTextbox1').value = 'Started.';
 }
 
 // Registers a page-load callback that starts the statistics run, but only
 // when the current URL is the edit view of User:Dr_pda/generatestats.
 addOnloadHook(function () {
   var editViewMarker = 'User:Dr_pda/generatestats&action=edit';
   var onStatsEditPage = document.location.href.indexOf(editViewMarker) != -1;
   if (onStatsEditPage) {
     generateStatistics();
   }
 });
 
 //</pre>