/*
 * Jump to content
 *
 * User:Kevin Baas/stat generator code/Main.java
 *
 * From Wikipedia, the free encyclopedia
 */
import java.io.*;
import java.util.*;

public class Main {
	// Shared word counter used by main(), read_files() and write_file_counts();
	// main() sets its count_path to write_path before anything else runs.
	public static CountWords count_words = new CountWords();
	// Root directory handed to read_files() by main(); the trailing comment on the
	// declaration line is an alternative location that is currently disabled.
	public static String read_path = //"C:\\Users\\happyjack27\\Downloads\\Wikipedia-0.7-static-beta2(3).tar\\Wikipedia-0.7-static-beta2\\Wikipedia-0.7-static-beta2";
		"C:\\Documents and Settings\\Administrator\\Desktop\\wikipedia-schools\\wp";
	// Directory under which every CSV file is written (and from which readStats() reads);
	// the trailing comment on the declaration line is a disabled alternative.
	public static String write_path =// "C:\\Users\\happyjack27\\Downloads\\Wikipedia-0.7-static-beta2(3).tar\\Wikipedia-0.7-static-beta2";
		"C:\\Documents and Settings\\Administrator\\Desktop\\index-results";
	// Marker parseOutWords() searches for; word extraction begins after the first '>' that follows it.
	public static String start = "bodyContent";//"Schools Wikipedia Selection";
	// Marker at which parseOutWords() stops extracting words.
	public static String end = "if (window.runOnloadHook)";//"printfooter";
	// Incremented by read_files() once per article file; used as a divisor in write_file_counts().
	public static int num_articles = 0;
	// Article names collected by read_files() (the file name with its extension cut off).
	public static Vector<String> articles = new Vector<String>();
	// Rows loaded by readStats(); main() re-sorts this list before passing it to writeStats().
	public static Vector<Stat> stats = new Vector<Stat>();
	//String[] directories = new String[27+10];

	//regress to mean main dictionary too - add x usages of each word.
	//do this before regressing articles to mean.

	/**
	 * Entry point of the statistics generator.
	 *
	 * Two stages are selected by hard-coded switches:
	 * <ul>
	 * <li>Disabled ({@code if(false)}): reload {@code _term_stats.csv}, then for every
	 *     combination of power {@code i}, column {@code j} and direction {@code k}
	 *     compute a sort key, sort, and write a {@code _sorted_stats_*} file.</li>
	 * <li>Enabled ({@code if(true)}): read all articles under {@link #read_path},
	 *     compress the main word index, write the count files, then write
	 *     {@code _term_stats.csv} and {@code _doc_stats.csv} under {@link #write_path}.</li>
	 * </ul>
	 * Failures while writing either stats file are reported on standard output
	 * instead of being discarded, and the output streams are always closed.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		count_words.count_path = write_path;

		if(false) {
			readStats("_term_stats");
			for( int i = 0; i < 3; i++) {
				for( int j = 20; j < 33; j++) {
					for( int k = 0; k < 2; k++) {
						System.out.println("calculating sort parameter");
						for( Stat st : stats) {
							st.sort_order = st.vals[j];
							st.sort_order /= st.vals[18];// pterm
							for( int n = 0; n < i; n++)
								st.sort_order *= st.vals[18];// pterm
							if( k > 0)
								st.sort_order *= -1; //invert asc/desc
						}
						System.out.println("sorting stat file");
						Collections.sort(stats,new StatComparator());
						writeStats(stats,"_sorted_stats_"+i+"_"+j+"_"+(k == 0 ? "asc" : "desc"));
					}
				}
			}
		}
		if(true) {
			read_files(read_path);
			System.out.println("total num articles: "+num_articles);

			System.out.println("compressing index");
			count_words.compressMainWordIndex(count_words.total_word_count / 50);

			System.out.println("writting main counts");
			write_file_counts(write_path+"\\");

			System.out.println("getting all stats");
			double[][][] allstats = count_words.getAllStats();

			System.out.println("writting term stats");
			try {
				StringBuilder csv = new StringBuilder();
				double[][] term_stats = allstats[1];
				for( int i = 0; i < count_words.words.length; i++) {
					try {
						// Build each row separately so that a failure half-way through
						// a row cannot leave a partial line fused with the next one.
						StringBuilder row = new StringBuilder();
						row.append(count_words.words[i]);
						double[] vals = term_stats[i];
						for( int j = 0; j < vals.length; j++) {
							row.append(","+vals[j]);
						}
						row.append("\n");
						csv.append(row);
					} catch (Exception ex) {
						System.out.println("ex "+ex);
						ex.printStackTrace();
					}
				}
				writeTextFile(write_path+"\\_term_stats.csv", csv.toString());
			} catch (Exception ex) {
				System.out.println("ex "+ex);
				ex.printStackTrace();
			}

			System.out.println("writting doc stats");
			try {
				StringBuilder csv = new StringBuilder();
				double[][] doc_stats = allstats[0];
				for( int i = 0; i < count_words.sarticles.length; i++) {
					csv.append(count_words.sarticles[i]);
					double[] vals = doc_stats[i];
					for( int j = 0; j < vals.length; j++) {
						csv.append(","+vals[j]);
					}
					csv.append("\n");
				}
				writeTextFile(write_path+"\\_doc_stats.csv", csv.toString());
			} catch (Exception ex) {
				// Previously swallowed silently; report it like the term-stats failure.
				System.out.println("ex "+ex);
				ex.printStackTrace();
			}
		}
		System.out.println("DONE.");
	}

	/**
	 * Writes {@code text} to {@code filePath} using the platform default charset,
	 * replacing any existing content. The stream is closed even if the write fails.
	 */
	private static void writeTextFile(String filePath, String text) throws IOException {
		FileOutputStream fos = new FileOutputStream(new File(filePath));
		try {
			fos.write(text.getBytes());
		} finally {
			fos.close();
		}
	}
	/**
	 * Walks the two-level directory tree {@code path/<dir>/<subdir>/} and feeds
	 * every file whose name contains {@code ".htm"} to the word counter.
	 *
	 * For each such file the article name is the file name with its extension
	 * removed (cut at the LAST dot, so names that themselves contain dots stay
	 * distinct). The name is appended to {@link #articles}, {@link #num_articles}
	 * is incremented, and the words returned by {@link #parseOutWords} are passed
	 * to {@code count_words.countArticleWords}.
	 *
	 * A top-level entry named {@code "index"} is skipped. Directories that cannot
	 * be listed and files that cannot be read completely are reported on standard
	 * output and skipped rather than being counted as empty articles.
	 *
	 * @param path root directory to scan
	 */
	public static void read_files(String path) {
		File root = new File(path);
		String[] dirs = root.list();
		if( dirs == null) {
			// File.list() returns null when the path is not a readable directory.
			System.out.println("cannot list directory: "+path);
			return;
		}
		for( int i = 0; i < dirs.length; i++) {
			if( dirs[i].equals("index"))
				continue;
			System.out.println(dirs[i]);
			File dir = new File(root, dirs[i]);
			if( !dir.isDirectory())
				continue;
			String[] d2 = dir.list();
			if( d2 == null)
				continue;
			for( int k = 0; k < d2.length; k++) {
				File sub = new File(dir, d2[k]);
				if( !sub.isDirectory())
					continue;
				String[] arts = sub.list();
				if( arts == null)
					continue;
				for( int j = 0; j < arts.length; j++) {
					if( arts[j].indexOf(".htm") < 0)
						continue;
					System.out.println(arts[j]);
					File f = new File(sub, arts[j]);
					byte[] bb;
					try {
						System.out.print(" "+f.length()+" ");
						bb = loadArticleBytes(f);
					} catch (IOException ex) {
						System.out.println("could not read "+f+": "+ex);
						continue;
					}
					String name = arts[j];
					int dot = name.lastIndexOf('.');
					if( dot > -1)
						name = name.substring(0, dot);
					articles.add(name);
					num_articles++;
					String[] words = parseOutWords(name, bb);
					count_words.countArticleWords(name, words);
				}
			}
		}
	}

	/**
	 * Reads the whole file into a byte array sized from {@code f.length()}.
	 * Unlike a single {@code read} call this either fills the buffer completely
	 * or throws; the stream is closed in all cases.
	 */
	private static byte[] loadArticleBytes(File f) throws IOException {
		byte[] buf = new byte[(int)f.length()];
		DataInputStream in = new DataInputStream(new FileInputStream(f));
		try {
			in.readFully(buf);
		} finally {
			in.close();
		}
		return buf;
	}

	/**
	 * Extracts the alphanumeric words from the body of an HTML page.
	 *
	 * Scanning begins just after the first {@code '>'} that follows the first
	 * occurrence of {@link #start} and stops at the first occurrence of
	 * {@link #end} (or at the end of the buffer). Bytes between {@code '<'} and
	 * {@code '>'} are treated as markup and ignored. Outside markup, each maximal
	 * run of ASCII letters and digits becomes one word; every other byte is a
	 * separator.
	 *
	 * All buffer accesses are bounds-checked, so a marker prefix at the very end
	 * of the buffer or a missing {@code '>'} no longer raises
	 * {@code ArrayIndexOutOfBoundsException}. A word still being accumulated when
	 * scanning stops is included in the result.
	 *
	 * @param article article name; not used by the parser
	 * @param bb      raw bytes of the page; {@code null} is treated as empty
	 * @return the words in document order; empty if the start marker (or the
	 *         {@code '>'} after it) is not found
	 */
	public static String[] parseOutWords(String article, byte[] bb) {
		if( bb == null)
			return new String[0];
		byte[] bstart = start.getBytes();
		byte[] bend = end.getBytes();
		List<String> found = new ArrayList<String>();

		// Locate the start marker, then skip to the '>' closing the tag it sits in.
		int offset = 0;
		for( ; offset < bb.length; offset++) {
			if( matchesAt(bb, offset, bstart)) {
				offset += bstart.length;
				while( offset < bb.length && bb[offset] != '>')
					offset++;
				break;
			}
		}
		// Step past the '>' (or past the end of the buffer when nothing was found).
		offset++;

		StringBuilder cur_word = new StringBuilder();
		boolean in_tag = false;
		for( ; offset < bb.length; offset++) {
			if( matchesAt(bb, offset, bend))
				break;
			byte b = bb[offset];
			if( in_tag) {
				if( b == '>')
					in_tag = false;
				continue;
			}
			if( (b >= '0' && b <= '9') || (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')) {
				cur_word.append((char)b);
				continue;
			}
			if( cur_word.length() > 0) {
				found.add(cur_word.toString());
				cur_word.setLength(0);
			}
			if( b == '<')
				in_tag = true;
		}
		// Keep the word that was in progress when the end marker or end of buffer was reached.
		if( cur_word.length() > 0)
			found.add(cur_word.toString());

		return found.toArray(new String[found.size()]);
	}

	/**
	 * Returns true when {@code pattern} occurs in {@code data} starting at
	 * {@code offset}; false if it would run past the end of {@code data}.
	 */
	private static boolean matchesAt(byte[] data, int offset, byte[] pattern) {
		if( offset + pattern.length > data.length)
			return false;
		for( int i = 0; i < pattern.length; i++)
			if( data[offset+i] != pattern[i])
				return false;
		return true;
	}

	/**
	 * Writes the global word tables and one frequency file per article.
	 *
	 * Under {@code path} this produces {@code _words.csv}, {@code _word_counts.csv}
	 * and {@code _word_freqs.csv} (one entry of {@code count_words.words},
	 * {@code values} and {@code freqs} per line respectively), followed by
	 * {@code freqs\<article>.csv} for every name in {@link #articles}, holding the
	 * values returned by {@code count_words.getMeanRegressedArticleWordFreq}.
	 *
	 * Each file is written independently: a failure is reported on standard
	 * output and the remaining files are still attempted. The {@code freqs}
	 * directory is created if it does not exist, and the per-article step is
	 * skipped when no articles were counted (it would otherwise divide by zero).
	 *
	 * @param path output directory prefix
	 */
	public static void write_file_counts(String path) {
		try {
			OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(path+"\\_words.csv")));
			try {
				for( int i = 0; i < count_words.words.length; i++)
					out.write((count_words.words[i] + "\n").getBytes());
			} finally {
				out.close();
			}
		} catch (Exception ex) {
			System.out.println("ex "+ex);
			ex.printStackTrace();
		}

		try {
			OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(path+"\\_word_counts.csv")));
			try {
				for( int i = 0; i < count_words.values.length; i++)
					out.write((count_words.values[i] + "\n").getBytes());
			} finally {
				out.close();
			}
		} catch (Exception ex) {
			System.out.println("ex "+ex);
			ex.printStackTrace();
		}

		try {
			OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(path+"\\_word_freqs.csv")));
			try {
				for( int i = 0; i < count_words.freqs.length; i++)
					out.write((count_words.freqs[i] + "\n").getBytes());
			} finally {
				out.close();
			}
		} catch (Exception ex) {
			System.out.println("ex "+ex);
			ex.printStackTrace();
		}

		if( num_articles == 0) {
			System.out.println("no articles counted; skipping per-article frequency files");
			return;
		}
		int words_to_add = count_words.total_word_count / num_articles / 10;

		// Without this directory every per-article file would fail to open.
		File freqDir = new File(path+"\\freqs");
		if( !freqDir.isDirectory() && !freqDir.mkdirs())
			System.out.println("cannot create directory "+freqDir);

		for( String article : articles) {
			System.out.println(".."+article);
			try {
				double[] dd = count_words.getMeanRegressedArticleWordFreq(article, words_to_add);
				OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(path+"\\freqs\\"+article+".csv")));
				try {
					for( int j = 0; j < dd.length; j++)
						out.write((dd[j] + "\n").getBytes());
				} finally {
					out.close();
				}
			} catch (Exception ex) {
				System.out.println("ex "+ex);
				ex.printStackTrace();
			}
		}
	}

	/**
	 * Loads {@code write_path\<nn>.csv} into {@link #stats}, replacing its
	 * previous contents.
	 *
	 * Each line is {@code name,v0,v1,...}. A row is kept only when
	 * {@code vals[18] >= 10}, {@code vals[19] >= 5} and the name does not start
	 * with a digit. Rows with fewer than 20 values or with a non-numeric value
	 * are skipped (and counted in a summary message) instead of aborting the
	 * whole load. I/O failures are reported with a stack trace; rows parsed
	 * before the failure remain in {@link #stats}.
	 *
	 * @param nn file name without directory or {@code .csv} extension
	 */
	public static void readStats(String nn) {
		System.out.println("reading stat file "+nn);
		stats = new Vector<Stat>();
		File f = new File(write_path+"\\"+nn+".csv");
		try {
			String[] lines = loadStatFileText(f).split("\n");
			int skipped = 0;
			for( int i = 0; i < lines.length; i++) {
				String[] fields = lines[i].split(",");
				// The filter below reads vals[18] and vals[19], i.e. fields[19] and fields[20].
				if( fields.length < 21) {
					skipped++;
					continue;
				}
				Stat st = new Stat();
				st.name = fields[0].trim();
				st.vals = new double[fields.length-1];
				try {
					for( int j = 1; j < fields.length; j++)
						st.vals[j-1] = Double.parseDouble(fields[j].trim());
				} catch (NumberFormatException ex) {
					skipped++;
					continue;
				}
				if( st.vals[18] < 10 || st.vals[19] < 5 || nameStartsWithDigit(st.name))
					continue;
				stats.add(st);
			}
			if( skipped > 0)
				System.out.println("skipped "+skipped+" malformed rows in "+nn);
		} catch (Exception ex) {
			ex.printStackTrace();
		}
	}

	/**
	 * Reads the whole file and decodes it with the platform default charset.
	 * The buffer is filled completely or an exception is thrown; the stream is
	 * always closed.
	 */
	private static String loadStatFileText(File f) throws IOException {
		byte[] bb = new byte[(int)f.length()];
		DataInputStream in = new DataInputStream(new FileInputStream(f));
		try {
			in.readFully(bb);
		} finally {
			in.close();
		}
		return new String(bb);
	}

	/** Returns true when {@code name} is non-empty and its first character is an ASCII digit. */
	private static boolean nameStartsWithDigit(String name) {
		return name.length() > 0 && name.charAt(0) >= '0' && name.charAt(0) <= '9';
	}

	/**
	 * Writes the given rows, in their current order, to
	 * {@code write_path\<nn>.csv}.
	 *
	 * Each output line is {@code name,v0,v1,...,sort_order}. A row is omitted
	 * when {@code vals[10] < 5}, {@code vals[9] < 10}, the name starts with a
	 * digit, or the row has too few values for that filter to be evaluated.
	 *
	 * The output stream is opened before the rows are written and closed
	 * afterwards even on failure. (The earlier version referenced the stream
	 * before declaring it and only opened it after the loop, so no row could
	 * ever be written.)
	 *
	 * NOTE(review): readStats() filters on vals[18]/vals[19] while this method
	 * filters on vals[9]/vals[10] -- confirm which columns are intended.
	 *
	 * @param stats rows to write; shadows the static field of the same name
	 * @param nn    file name without directory or {@code .csv} extension
	 */
	public static void writeStats(Vector<Stat> stats, String nn) {
		System.out.println("writting sorted stat file "+nn);
		File f = new File(write_path+"\\"+nn+".csv");
		try {
			OutputStream out = new BufferedOutputStream(new FileOutputStream(f));
			try {
				for( Stat st : stats) {
					if( st.vals.length <= 10)
						continue;
					if( st.vals[10] < 5 || st.vals[9] < 10)
						continue;
					if( st.name.length() > 0 && st.name.charAt(0) >= '0' && st.name.charAt(0) <= '9')
						continue;
					StringBuilder row = new StringBuilder();
					row.append(st.name);
					for( int i = 0; i < st.vals.length; i++)
						row.append(","+st.vals[i]);
					row.append(","+st.sort_order);
					row.append("\n");
					out.write(row.toString().getBytes());
				}
			} finally {
				out.close();
			}
		} catch (Exception ex) {
			ex.printStackTrace();
		}
	}
}