// User:Kevin Baas/stat generator code/Main.java
import java.io.*;
import java.util.*;
/**
 * Builds word statistics for a tree of HTML articles and writes them as CSV.
 *
 * <p>The generation pass walks {@code read_path} (two levels of sub-directories
 * containing {@code .htm}/{@code .html} files), extracts the ASCII alphanumeric
 * words of each article body and hands them to {@link #count_words}. The
 * resulting tables are written below {@code write_path}. A second, normally
 * disabled, pass re-reads {@code _term_stats.csv} and writes it back sorted by
 * several derived keys.
 *
 * <p>TODO (carried over from the original notes): regress the main dictionary
 * to the mean too - add x usages of each word - and do this before regressing
 * the articles to the mean.
 */
public class Main {
    /** Word index shared by every step; {@link #main} points its {@code count_path} at {@link #write_path}. */
    public static CountWords count_words = new CountWords();

    /** Root directory that {@link #read_files(String)} scans for articles. */
    public static String read_path =
            "C:\\Documents and Settings\\Administrator\\Desktop\\wikipedia-schools\\wp";

    /** Directory that receives every generated CSV file. */
    public static String write_path =
            "C:\\Documents and Settings\\Administrator\\Desktop\\index-results";

    /**
     * Marker whose first occurrence begins the article body; parsing starts after
     * the next {@code '>'} that follows it. (An earlier variant used
     * "Schools Wikipedia Selection".)
     */
    public static String start = "bodyContent";

    /** Marker at which parsing of an article stops. (An earlier variant used "printfooter".) */
    public static String end = "if (window.runOnloadHook)";

    /** Number of articles handed to {@link #count_words} so far. */
    public static int num_articles = 0;

    /** Names (file name up to the first '.') of the articles read so far. */
    public static Vector<String> articles = new Vector<String>();

    /** Rows most recently loaded by {@link #readStats(String)}. */
    public static Vector<Stat> stats = new Vector<Stat>();

    /** Enables the pass that re-sorts an existing {@code _term_stats.csv}. */
    private static final boolean SORT_EXISTING_STATS = false;

    /** Enables the pass that scans {@link #read_path} and generates all statistics. */
    private static final boolean GENERATE_STATS = true;

    /** A stat row must carry at least this many values because columns 18 and 19 are read. */
    private static final int MIN_STAT_VALUES = 20;

    /**
     * Entry point: runs whichever passes are enabled by the two mode constants.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        count_words.count_path = write_path;
        if (SORT_EXISTING_STATS) {
            sortExistingStats();
        }
        if (GENERATE_STATS) {
            generateStats();
        }
        System.out.println("DONE.");
    }

    /**
     * Loads {@code _term_stats.csv} and writes one sorted copy per combination of
     * power {@code i} (0..2), value column {@code j} (20..32) and direction
     * {@code k}. The sort key is {@code vals[j] / vals[18] * vals[18]^i}, negated
     * for the second direction. Column 18 is labelled "pterm" in the original code.
     */
    private static void sortExistingStats() {
        readStats("_term_stats");
        for (int i = 0; i < 3; i++) {
            for (int j = 20; j < 33; j++) {
                for (int k = 0; k < 2; k++) {
                    System.out.println("calculating sort parameter");
                    for (Stat st : stats) {
                        st.sort_order = st.vals[j];
                        st.sort_order /= st.vals[18]; // pterm
                        // Repeated multiplication (not Math.pow) keeps the exact
                        // floating-point result of the original loop.
                        for (int n = 0; n < i; n++) {
                            st.sort_order *= st.vals[18]; // pterm
                        }
                        if (k > 0) {
                            st.sort_order *= -1; // invert asc/desc
                        }
                    }
                    System.out.println("sorting stat file");
                    Collections.sort(stats, new StatComparator());
                    writeStats(stats, "_sorted_stats_" + i + "_" + j + "_" + (k == 0 ? "asc" : "desc"));
                }
            }
        }
    }

    /**
     * Reads every article, compresses the word index, then writes the word
     * tables, the per-term statistics and the per-document statistics.
     *
     * <p>Note carried over from the original: entropy was planned as
     * -sum(global_prob * word count of article * log(article_prob)) /
     * (global_prob * total_word_count); the variance of a Poisson distribution
     * was noted for the word frequencies.
     */
    private static void generateStats() {
        read_files(read_path);
        System.out.println("total num articles: " + num_articles);
        System.out.println("compressing index");
        count_words.compressMainWordIndex(count_words.total_word_count / 50);
        System.out.println("writting main counts");
        write_file_counts(write_path + "\\");
        System.out.println("getting all stats");
        double[][][] allstats = count_words.getAllStats();
        System.out.println("writting term stats");
        writeTermStats(allstats[1]);
        System.out.println("writting doc stats");
        writeDocStats(allstats[0]);
    }

    /** Writes {@code _term_stats.csv}: one row per entry of {@code count_words.words}. */
    private static void writeTermStats(double[][] termStats) {
        StringBuilder csv = new StringBuilder();
        for (int i = 0; i < count_words.words.length; i++) {
            if (i >= termStats.length || termStats[i] == null) {
                // Skip instead of emitting a half-written row.
                System.err.println("no term stats for row " + i + ", skipped");
                continue;
            }
            csv.append(count_words.words[i]);
            appendValues(csv, termStats[i]);
        }
        writeTextFile(new File(write_path, "_term_stats.csv"), csv);
    }

    /** Writes {@code _doc_stats.csv}: one row per entry of {@code count_words.sarticles}. */
    private static void writeDocStats(double[][] docStats) {
        StringBuilder csv = new StringBuilder();
        for (int i = 0; i < count_words.sarticles.length; i++) {
            if (i >= docStats.length || docStats[i] == null) {
                System.err.println("no doc stats for row " + i + ", skipped");
                continue;
            }
            csv.append(count_words.sarticles[i]);
            appendValues(csv, docStats[i]);
        }
        writeTextFile(new File(write_path, "_doc_stats.csv"), csv);
    }

    /** Appends {@code ,v0,v1,...} followed by a newline to {@code csv}. */
    private static void appendValues(StringBuilder csv, double[] values) {
        for (int j = 0; j < values.length; j++) {
            csv.append(",").append(values[j]);
        }
        csv.append("\n");
    }

    /**
     * Writes {@code text} to {@code file} in the platform default charset,
     * replacing any existing file. I/O failures are reported on stderr rather
     * than thrown so that the remaining outputs are still attempted.
     */
    private static void writeTextFile(File file, CharSequence text) {
        try {
            Writer out = new BufferedWriter(new FileWriter(file));
            try {
                out.write(text.toString());
            } finally {
                out.close();
            }
        } catch (IOException ex) {
            System.err.println("could not write " + file + ": " + ex);
            ex.printStackTrace();
        }
    }

    /** Reads the whole of {@code file} into a byte array. */
    private static byte[] readFully(File file) throws IOException {
        byte[] data = new byte[(int) file.length()];
        DataInputStream in = new DataInputStream(new FileInputStream(file));
        try {
            // A single InputStream.read call may return fewer bytes than requested.
            in.readFully(data);
        } finally {
            in.close();
        }
        return data;
    }

    /**
     * Walks {@code path/<dir>/<subdir>/} and processes every file whose name
     * contains ".htm": the article is recorded in {@link #articles}, counted in
     * {@link #num_articles}, parsed with {@link #parseOutWords(String, byte[])}
     * and passed to {@code count_words.countArticleWords}. A top-level entry
     * named "index" is skipped. Files that cannot be read are reported and
     * skipped.
     *
     * @param path root directory to scan
     */
    public static void read_files(String path) {
        File root = new File(path);
        String[] dirs = root.list();
        if (dirs == null) {
            // File.list() returns null when the path is not a readable directory.
            System.err.println("cannot list directory: " + path);
            return;
        }
        for (String dirName : dirs) {
            if (dirName.equals("index")) {
                continue;
            }
            System.out.println(dirName);
            File dir = new File(root, dirName);
            if (!dir.isDirectory()) {
                continue;
            }
            String[] subDirs = dir.list();
            if (subDirs == null) {
                continue;
            }
            for (String subDirName : subDirs) {
                File subDir = new File(dir, subDirName);
                if (!subDir.isDirectory()) {
                    continue;
                }
                String[] fileNames = subDir.list();
                if (fileNames == null) {
                    continue;
                }
                for (String fileName : fileNames) {
                    if (fileName.indexOf(".htm") < 0) {
                        continue;
                    }
                    System.out.println(fileName);
                    File file = new File(subDir, fileName);
                    System.out.print(" " + file.length() + " ");
                    byte[] bytes;
                    try {
                        bytes = readFully(file);
                    } catch (IOException ex) {
                        System.err.println("could not read " + file + ": " + ex);
                        continue;
                    }
                    // NOTE(review): the name is cut at the FIRST '.', so a file such as
                    // "A.B_C.html" becomes "A"; confirm whether lastIndexOf was intended.
                    String article = fileName;
                    int dot = article.indexOf(".");
                    if (dot > -1) {
                        article = article.substring(0, dot);
                    }
                    articles.add(article);
                    num_articles++;
                    String[] words = parseOutWords(article, bytes);
                    count_words.countArticleWords(article, words);
                }
            }
        }
    }

    /**
     * Extracts the words of an article body from raw HTML bytes.
     *
     * <p>Parsing begins after the first {@code '>'} that follows the first
     * occurrence of {@link #start} and stops at the first occurrence of
     * {@link #end} or at the end of the buffer. Everything between {@code '<'}
     * and {@code '>'} is skipped. A word is a maximal run of the bytes
     * {@code 0-9}, {@code a-z}, {@code A-Z}; every other byte (including all
     * non-ASCII bytes) separates words.
     *
     * @param article article name; not used by the parser
     * @param bb      raw file contents; {@code null} is treated as empty
     * @return the words in document order; empty if the start marker is absent
     */
    public static String[] parseOutWords(String article, byte[] bb) {
        if (bb == null) {
            return new String[0];
        }
        byte[] bstart = start.getBytes();
        byte[] bend = end.getBytes();

        int offset = indexOf(bb, bstart);
        if (offset < 0) {
            return new String[0];
        }
        offset += bstart.length;
        // Skip the remainder of the tag that contains the start marker.
        while (offset < bb.length && bb[offset] != '>') {
            offset++;
        }
        offset++;

        List<String> found = new ArrayList<String>();
        StringBuilder cur_word = new StringBuilder();
        boolean in_tag = false;
        for (; offset < bb.length; offset++) {
            if (matchesAt(bb, offset, bend)) {
                break;
            }
            byte b = bb[offset];
            if (in_tag) {
                if (b == '>') {
                    in_tag = false;
                }
                continue;
            }
            if (isWordByte(b)) {
                cur_word.append((char) b);
                continue;
            }
            if (cur_word.length() > 0) {
                found.add(cur_word.toString());
                cur_word.setLength(0);
            }
            if (b == '<') {
                in_tag = true;
            }
        }
        // A word that runs up to the end marker or the end of the buffer is still a word.
        if (cur_word.length() > 0) {
            found.add(cur_word.toString());
        }
        return found.toArray(new String[found.size()]);
    }

    /** Returns the index of the first occurrence of {@code needle} in {@code haystack}, or -1. */
    private static int indexOf(byte[] haystack, byte[] needle) {
        for (int pos = 0; pos + needle.length <= haystack.length; pos++) {
            if (matchesAt(haystack, pos, needle)) {
                return pos;
            }
        }
        return -1;
    }

    /** Returns true if {@code needle} occurs in {@code haystack} at {@code pos} without running past the end. */
    private static boolean matchesAt(byte[] haystack, int pos, byte[] needle) {
        if (pos + needle.length > haystack.length) {
            return false;
        }
        for (int i = 0; i < needle.length; i++) {
            if (haystack[pos + i] != needle[i]) {
                return false;
            }
        }
        return true;
    }

    /** Returns true for the ASCII bytes {@code 0-9}, {@code a-z} and {@code A-Z}. */
    private static boolean isWordByte(byte b) {
        return (b >= '0' && b <= '9') || (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z');
    }

    /**
     * Writes the global word tables ({@code _words.csv}, {@code _word_counts.csv},
     * {@code _word_freqs.csv}) and one {@code freqs/<article>.csv} per entry of
     * {@link #articles} below {@code path}. The {@code freqs} directory is
     * created if it is missing. Failures are reported on stderr and do not stop
     * the remaining files from being written.
     *
     * @param path output directory (a trailing separator is accepted)
     */
    public static void write_file_counts(String path) {
        File dir = new File(path);

        StringBuilder text = new StringBuilder();
        for (int i = 0; i < count_words.words.length; i++) {
            text.append(count_words.words[i] + "\n");
        }
        writeTextFile(new File(dir, "_words.csv"), text);

        text = new StringBuilder();
        for (int i = 0; i < count_words.values.length; i++) {
            text.append(count_words.values[i] + "\n");
        }
        writeTextFile(new File(dir, "_word_counts.csv"), text);

        text = new StringBuilder();
        for (int i = 0; i < count_words.freqs.length; i++) {
            text.append(count_words.freqs[i] + "\n");
        }
        writeTextFile(new File(dir, "_word_freqs.csv"), text);

        if (num_articles == 0) {
            // words_to_add below would divide by zero.
            System.err.println("no articles were read; skipping per-article frequencies");
            return;
        }
        File freqDir = new File(dir, "freqs");
        if (!freqDir.isDirectory() && !freqDir.mkdirs()) {
            System.err.println("could not create directory " + freqDir);
            return;
        }
        int words_to_add = count_words.total_word_count / num_articles / 10;
        for (String article : articles) {
            System.out.println(".." + article);
            try {
                double[] dd = count_words.getMeanRegressedArticleWordFreq(article, words_to_add);
                StringBuilder freqText = new StringBuilder();
                for (int j = 0; j < dd.length; j++) {
                    freqText.append(dd[j] + "\n");
                }
                writeTextFile(new File(freqDir, article + ".csv"), freqText);
            } catch (Exception ex) {
                // Best effort per article, as in the original, but no longer silent.
                System.err.println("could not write frequencies for " + article + ": " + ex);
                ex.printStackTrace();
            }
        }
    }

    /**
     * Replaces {@link #stats} with the rows of {@code write_path/<nn>.csv}.
     *
     * <p>Each line is {@code name,v0,v1,...}. Rows are dropped when value 18 is
     * below 10, value 19 is below 5, or the name starts with a digit. Blank
     * lines are ignored; rows that are too short or contain a non-numeric value
     * are reported and skipped instead of aborting the whole load.
     *
     * @param nn file name without the ".csv" extension
     */
    public static void readStats(String nn) {
        System.out.println("reading stat file " + nn);
        stats = new Vector<Stat>();
        File f = new File(write_path, nn + ".csv");
        try {
            BufferedReader in = new BufferedReader(new FileReader(f));
            try {
                String line;
                int lineNo = 0;
                while ((line = in.readLine()) != null) {
                    lineNo++;
                    if (line.trim().length() == 0) {
                        continue;
                    }
                    Stat st = parseStatLine(line);
                    if (st == null) {
                        System.err.println("skipping malformed line " + lineNo + " of " + f);
                        continue;
                    }
                    // NOTE(review): writeStats filters on columns 10 and 9 instead of
                    // 18 and 19; confirm which pair is intended.
                    if (st.vals[18] < 10 || st.vals[19] < 5
                            || (st.name.charAt(0) >= '0' && st.name.charAt(0) <= '9')) {
                        continue;
                    }
                    stats.add(st);
                }
            } finally {
                in.close();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Parses one {@code name,v0,v1,...} line.
     *
     * @return the parsed row, or {@code null} if the name is empty, fewer than
     *         {@link #MIN_STAT_VALUES} values are present, or a value is not a number
     */
    private static Stat parseStatLine(String line) {
        String[] fields = line.split(",");
        if (fields.length < MIN_STAT_VALUES + 1) {
            return null;
        }
        String name = fields[0].trim();
        if (name.length() == 0) {
            return null;
        }
        double[] vals = new double[fields.length - 1];
        try {
            for (int j = 1; j < fields.length; j++) {
                vals[j - 1] = Double.parseDouble(fields[j].trim());
            }
        } catch (NumberFormatException ex) {
            return null;
        }
        Stat st = new Stat();
        st.name = name;
        st.vals = vals;
        return st;
    }

    /**
     * Writes {@code stats} to {@code write_path/<nn>.csv}, one
     * {@code name,v0,...,vn,sort_order} line per row, in the given order.
     *
     * <p>Rows are omitted when value 10 is below 5, value 9 is below 10, or the
     * name starts with a digit; rows without a name or with fewer than 11 values
     * are omitted as well.
     *
     * @param stats rows to write
     * @param nn    file name without the ".csv" extension
     */
    public static void writeStats(Vector<Stat> stats, String nn) {
        System.out.println("writting sorted stat file " + nn);
        File f = new File(write_path, nn + ".csv");
        try {
            // The stream must exist before the first row is written; the original
            // declared it only after the loop.
            Writer out = new BufferedWriter(new FileWriter(f));
            try {
                for (Stat st : stats) {
                    if (st == null || st.vals == null || st.vals.length <= 10) {
                        continue;
                    }
                    if (st.vals[10] < 5 || st.vals[9] < 10) {
                        continue;
                    }
                    if (st.name == null || st.name.length() == 0) {
                        continue;
                    }
                    if (st.name.charAt(0) >= '0' && st.name.charAt(0) <= '9') {
                        continue;
                    }
                    StringBuilder row = new StringBuilder();
                    row.append(st.name);
                    for (int i = 0; i < st.vals.length; i++) {
                        row.append("," + st.vals[i]);
                    }
                    row.append("," + st.sort_order);
                    row.append("\n");
                    out.write(row.toString());
                }
            } finally {
                out.close();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}