// Wikipedia:AutoEd/unicodify.js
//
// From Wikipedia, the free encyclopedia
/**
 * Replaces HTML character references and stray Windows-1252 control
 * characters in a piece of text with the literal characters they stand for.
 *
 * Three passes are applied in order:
 *  1. Named entities (&mdash;, &alpha;, &rarr;, ...) plus the ASCII arrows
 *     "<==", "<--" and "==>".
 *  2. Numeric references, decimal (&#8212;) and hexadecimal (&#x2014;).
 *  3. Raw U+0080..U+009F characters, reinterpreted as Windows-1252 bytes.
 *
 * Entities and references that are not listed, or that are deliberately
 * excluded (see the comments below), are left exactly as written.
 *
 * @param {string} str Text to clean up.
 * @returns {string} The text with the supported references replaced.
 */
function autoEdUnicodify(str) { //MAIN FUNCTION describes list of fixes

 // Task 1: Replace named html entities with unicode

 // Entities recognised regardless of letter case (&MDASH; as well as &mdash;).
 // Keys are the lower-cased entity names.
 var anyCaseEntities = {
  // Most common replacements
  mdash: '—', ndash: '–',
  // XML and HTML symbols (&hellip; intentionally becomes three full stops)
  hellip: '...', plus: '+', plusmn: '±', minus: '−', times: '×', divide: '÷',
  ne: '≠', asymp: '≈', le: '≤', ge: '≥', quot: '"', apos: "'",
  iexcl: '¡', cent: '¢', pound: '£', curren: '¤', yen: '¥', brvbar: '¦',
  sect: '§', uml: '¨', copy: '©', ordf: 'ª', laquo: '«', not: '¬',
  reg: '®', macr: '¯', deg: '°', sup2: '²', sup3: '³', acute: '´',
  micro: 'µ', para: '¶', middot: '·', cedil: '¸', sup1: '¹', ordm: 'º',
  raquo: '»', frac14: '¼', frac12: '½', frac34: '¾', iquest: '¿',
  circ: 'ˆ', tilde: '˜', lsquo: '‘', rsquo: '’', sbquo: '‚', ldquo: '“',
  rdquo: '”', bdquo: '„', bull: '•', permil: '‰', lsaquo: '‹', rsaquo: '›',
  oline: '‾', frasl: '⁄', euro: '€', image: 'ℑ', weierp: '℘', real: 'ℜ',
  trade: '™', alefsym: 'ℵ', crarr: '↵', forall: '∀', part: '∂', exist: '∃',
  empty: '∅', nabla: '∇', isin: '∈', notin: '∉', ni: '∋', prod: '∏',
  sum: '∑', lowast: '∗', radic: '√', prop: '∝', infin: '∞', ang: '∠',
  and: '∧', or: '∨', cap: '∩', cup: '∪', int: '∫', there4: '∴',
  sim: '∼', cong: '≅', sub: '⊂', sup: '⊃', nsub: '⊄', sube: '⊆',
  supe: '⊇', oplus: '⊕', otimes: '⊗', perp: '⊥', sdot: '⋅',
  lceil: '⌈', rceil: '⌉', lfloor: '⌊', rfloor: '⌋', lang: '〈', rang: '〉',
  loz: '◊', spades: '♠', clubs: '♣', hearts: '♥', diams: '♦'
 };

 // Entities whose letter case distinguishes two different characters
 // (&Alpha; vs &alpha;, &lArr; vs &larr;), so they must match exactly.
 //
 // False positives, deliberately NOT listed:
 //  &lt; and &gt; - breaks large amounts of code which discuss
 //                  programming/scripting.
 //  &amp;         - breaks large number of URLs and discussion of
 //                  programming/scripting.
 var exactCaseEntities = {
  // Uppercase Greek symbols
  Alpha: 'Α', Beta: 'Β', Gamma: 'Γ', Delta: 'Δ', Epsilon: 'Ε', Zeta: 'Ζ',
  Eta: 'Η', Theta: 'Θ', Iota: 'Ι', Kappa: 'Κ', Lambda: 'Λ', Mu: 'Μ',
  Nu: 'Ν', Xi: 'Ξ', Omicron: 'Ο', Pi: 'Π', Rho: 'Ρ', Sigma: 'Σ',
  Tau: 'Τ', Upsilon: 'Υ', Phi: 'Φ', Chi: 'Χ', Psi: 'Ψ', Omega: 'Ω',
  // Uppercase Latin symbols
  Agrave: 'À', Aacute: 'Á', Acirc: 'Â', Atilde: 'Ã', Auml: 'Ä', Aring: 'Å',
  AElig: 'Æ', Ccedil: 'Ç', Egrave: 'È', Eacute: 'É', Ecirc: 'Ê', Euml: 'Ë',
  Igrave: 'Ì', Iacute: 'Í', Icirc: 'Î', Iuml: 'Ï', Ntilde: 'Ñ',
  Ograve: 'Ò', Oacute: 'Ó', Ocirc: 'Ô', Otilde: 'Õ', Ouml: 'Ö', Oslash: 'Ø',
  Ugrave: 'Ù', Uacute: 'Ú', Ucirc: 'Û', Uuml: 'Ü', Yacute: 'Ý',
  Scaron: 'Š', Yuml: 'Ÿ',
  // Uppercase XML and HTML symbols
  Dagger: '‡', Prime: '″',
  // Lowercase Greek symbols
  alpha: 'α', beta: 'β', gamma: 'γ', delta: 'δ', epsilon: 'ε', zeta: 'ζ',
  eta: 'η', theta: 'θ', iota: 'ι', kappa: 'κ', lambda: 'λ', mu: 'μ',
  nu: 'ν', xi: 'ξ', omicron: 'ο', pi: 'π', rho: 'ρ', sigmaf: 'ς',
  sigma: 'σ', tau: 'τ', upsilon: 'υ', phi: 'φ', chi: 'χ', psi: 'ψ',
  omega: 'ω', thetasym: 'ϑ', upsih: 'ϒ', piv: 'ϖ',
  // Lowercase Latin symbols
  szlig: 'ß', agrave: 'à', aacute: 'á', acirc: 'â', atilde: 'ã', auml: 'ä',
  aring: 'å', aelig: 'æ', ccedil: 'ç', egrave: 'è', eacute: 'é', ecirc: 'ê',
  euml: 'ë', igrave: 'ì', iacute: 'í', icirc: 'î', iuml: 'ï', eth: 'ð',
  ntilde: 'ñ', ograve: 'ò', oacute: 'ó', ocirc: 'ô', otilde: 'õ', ouml: 'ö',
  oslash: 'ø', ugrave: 'ù', uacute: 'ú', ucirc: 'û', uuml: 'ü', yacute: 'ý',
  thorn: 'þ', yuml: 'ÿ', oelig: 'œ', scaron: 'š', fnof: 'ƒ',
  // Lowercase XML and HTML symbols
  dagger: '†', prime: '′',
  // Arrows
  larr: '←', rarr: '→', uarr: '↑', darr: '↓',
  lArr: '⇐', rArr: '⇒', uArr: '⇑', dArr: '⇓',
  harr: '↔', hArr: '⇔',
  // Specific case
  ETH: 'Ð', THORN: 'Þ', OElig: 'Œ'
 };

 // Own-property test, so names such as "constructor" or "toString" in the
 // text never resolve to something inherited from Object.prototype.
 var hasOwn = Object.prototype.hasOwnProperty;

 // One pass over every "&name;" token; unknown names are returned untouched.
 str = str.replace(/&([A-Za-z][A-Za-z0-9]*);/g, function (ent, name) {
  var folded = name.toLowerCase();
  if (hasOwn.call(anyCaseEntities, folded)) {
   return anyCaseEntities[folded];
  }
  if (hasOwn.call(exactCaseEntities, name)) {
   return exactCaseEntities[name];
  }
  return ent;
 });

 // ASCII-art arrows
 str = str.replace(/<==|<--/gi, '←');
 str = str.replace(/==>/gi, '→');


 // Task 2: Replace numeric html entities with unicode ( User:CharlotteWebb )

 // Symbols for which there may be a good reason to obfuscate/escape.
 // "&" is included for the same reason &amp; is not decoded in Task 1:
 // a bare ampersand can turn the following text into a new entity.
 var keepEscaped = "&|!{}[]=<>";

 // Returns the character for one numeric reference, or the reference itself
 // when it must stay escaped or does not denote a usable code point.
 function decodeReference(ent, digits, base) {
  var num = parseInt(digits, base);
  var chr;
  // Reject NUL, anything beyond the Unicode range, and lone surrogates.
  if (!(num > 0 && num <= 0x10FFFF) || (num >= 0xD800 && num <= 0xDFFF)) {
   return ent;
  }
  // see [[UTF-16]] for chars outside the BMP
  if (num > 0xFFFF) {
   num -= 0x10000;
   chr = String.fromCharCode(0xD800 + (num >> 10), 0xDC00 + (num & 0x3FF));
  } else {
   chr = String.fromCharCode(num);
  }
  return keepEscaped.indexOf(chr) === -1 ? chr : ent;
 }

 // Decimal references first, then hexadecimal ("x" may be either case).
 str = str.replace(/&#(\d+);/g, function (ent, digits) {
  return decodeReference(ent, digits, 10);
 });
 str = str.replace(/&#x([\da-f]+);/gi, function (ent, digits) {
  return decodeReference(ent, digits, 16);
 });


 // Task 3: Unprintable control characters [[Windows-1252]] from User:CharlotteWebb
 var failstr = "<!-- AutoEd: rm unicode ctrl char w/no win-1252 mapping, intent unknown -->";

 // Replacement for each of U+0080..U+009F, indexed by (code unit - 0x80).
 // Positions with no Windows-1252 assignment are replaced by failstr.
 var win1252 = [
  '€', failstr, '‚', 'ƒ', '„', '…', '†', '‡',
  'ˆ', '‰', 'Š', '‹', 'Œ', failstr, 'Ž', failstr,
  failstr, '‘', '’', '“', '”', '•', '–', '—',
  '˜', '™', 'š', '›', 'œ', failstr, 'ž', 'Ÿ'
 ];
 str = str.replace(/[\u0080-\u009f]/g, function (ctrl) {
  return win1252[ctrl.charCodeAt(0) - 0x80];
 });

 return str;
}