1. package br.com.thor.tools.dmoz;
  2.  
  3. /**
  4.  * Licensed to the Apache Software Foundation (ASF) under one or more
  5.  * contributor license agreements. See the NOTICE file distributed with
  6.  * this work for additional information regarding copyright ownership.
  7.  * The ASF licenses this file to You under the Apache License, Version 2.0
  8.  * (the "License"); you may not use this file except in compliance with
  9.  * the License. You may obtain a copy of the License at
  10.  *
  11.  * http://www.apache.org/licenses/LICENSE-2.0
  12.  *
  13.  * Unless required by applicable law or agreed to in writing, software
  14.  * distributed under the License is distributed on an "AS IS" BASIS,
  15.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16.  * See the License for the specific language governing permissions and
  17.  * limitations under the License.
  18.  */
  19.  
  20. import java.io.BufferedInputStream;
  21. import java.io.BufferedReader;
  22. import java.io.File;
  23. import java.io.FileInputStream;
  24. import java.io.FilterReader;
  25. import java.io.IOException;
  26. import java.io.InputStreamReader;
  27. import java.io.Reader;
  28. import java.util.StringTokenizer;
  29. import java.util.Vector;
  30. import java.util.regex.Pattern;
  31.  
  32. import javax.xml.parsers.ParserConfigurationException;
  33. import javax.xml.parsers.SAXParser;
  34. import javax.xml.parsers.SAXParserFactory;
  35.  
  36. import org.apache.xerces.util.XMLChar;
  37. import org.xml.sax.Attributes;
  38. import org.xml.sax.InputSource;
  39. import org.xml.sax.Locator;
  40. import org.xml.sax.SAXException;
  41. import org.xml.sax.SAXParseException;
  42. import org.xml.sax.XMLReader;
  43. import org.xml.sax.helpers.DefaultHandler;
  44.  
  45. /** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */
  46. public class DmozToFlat {
  47. long pages = 0;
  48.  
  49. /**
  50. * This filter fixes characters that might offend our parser. This lets us
  51. * be tolerant of errors that might appear in the input XML.
  52. */
  53. private static class XMLCharFilter extends FilterReader {
  54. private boolean lastBad = false;
  55.  
  56. public XMLCharFilter(Reader reader) {
  57. super(reader);
  58. }
  59.  
  60. public int read() throws IOException {
  61. int c = in.read();
  62. int value = c;
  63. if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters
  64. value = 'X';
  65. else if (lastBad && c == '<') { // fix mis-matched brackets
  66. in.mark(1);
  67. if (in.read() != '/')
  68. value = 'X';
  69. in.reset();
  70. }
  71. lastBad = (c == 65533);
  72.  
  73. return value;
  74. }
  75.  
  76. public int read(char[] cbuf, int off, int len) throws IOException {
  77. int n = in.read(cbuf, off, len);
  78. if (n != -1) {
  79. for (int i = 0; i < n; i++) {
  80. char c = cbuf[off + i];
  81. char value = c;
  82. if (!(XMLChar.isValid(c))) // fix invalid characters
  83. value = 'X';
  84. else if (lastBad && c == '<') { // fix mis-matched brackets
  85. if (i != n - 1 && cbuf[off + i + 1] != '/')
  86. value = 'X';
  87. }
  88. lastBad = (c == 65533);
  89. cbuf[off + i] = value;
  90. }
  91. }
  92. return n;
  93. }
  94. }
  95.  
  96. /**
  97. * The RDFProcessor receives tag messages during a parse of RDF XML data. We
  98. * build whatever structures we need from these messages.
  99. */
  100. private class RDFProcessor extends DefaultHandler {
  101. String curURL = null, curSection = null;
  102. boolean titlePending = false, descPending = false,
  103. insideAdultSection = false;
  104. Pattern topicPattern = null;
  105. StringBuffer title = new StringBuffer(), desc = new StringBuffer();
  106. XMLReader reader;
  107. int subsetDenom;
  108. int hashSkew;
  109. Locator location;
  110. private boolean halfBollean;
  111.  
  112. /**
  113. * Pass in an XMLReader, plus a flag as to whether we should include
  114. * adult material.
  115. */
  116. public RDFProcessor(XMLReader reader, int subsetDenom, int skew,
  117. Pattern topicPattern) throws IOException {
  118. this.reader = reader;
  119. this.subsetDenom = subsetDenom;
  120. this.topicPattern = topicPattern;
  121. }
  122.  
  123. //
  124. // Interface ContentHandler
  125. //
  126.  
  127. /**
  128. * Start of an XML elt
  129. */
  130. public void startElement(String namespaceURI, String localName,
  131. String qName, Attributes atts) throws SAXException {
  132. if ("Topic".equals(qName)) {
  133. String category = atts.getValue("r:id");
  134. category = category.replaceAll("Top/", "");
  135. if(category == null || category.trim().length() == 0)
  136. return;
  137. StringTokenizer stringTokenizer = new StringTokenizer(category,"/");
  138. String firstLevel = stringTokenizer.nextToken();
  139. String secondLevel = "Others";
  140. if(stringTokenizer.hasMoreTokens()){
  141. secondLevel = stringTokenizer.nextToken();
  142. }
  143. curSection = firstLevel+"/"+secondLevel;
  144.  
  145. } else if ("ExternalPage".equals(qName)) {
  146.  
  147. if (!curSection.startsWith("Sports")) {
  148. return;
  149. }
  150.  
  151. // Subset denominator filter.
  152. // Only emit with a chance of 1/denominator.
  153. String url = atts.getValue("about");
  154.  
  155. if(halfBollean){
  156. halfBollean = false;
  157. return;
  158. }
  159.  
  160. // We actually claim the URL!
  161. curURL = url;
  162. halfBollean = true;
  163.  
  164. } else if (curURL != null && "d:Title".equals(qName)) {
  165. titlePending = true;
  166. } else if (curURL != null && "d:Description".equals(qName)) {
  167. descPending = true;
  168. }
  169. }
  170.  
  171. /**
  172. * The contents of an XML elt
  173. */
  174. public void characters(char ch[], int start, int length) {
  175. if (titlePending && title.length() == 0) {
  176. String str = new String(ch, start, length);
  177. str = str.replace('\n', ' ');
  178. str = str.replace('\t', ' ');
  179. title.append(str);
  180. } else if (descPending && desc.length() == 0) {
  181. String str = new String(ch, start, length);
  182. str = str.replace('\n', ' ');
  183. str = str.replace('\t', ' ');
  184. desc.append(str);
  185. }
  186. }
  187.  
  188. /**
  189. * Termination of XML elt
  190. */
  191. public void endElement(String namespaceURI, String localName,
  192. String qName) throws SAXException {
  193. if (curURL != null) {
  194. if ("ExternalPage".equals(qName)) {
  195. //
  196. // Inc the number of pages, insert the page, and
  197. // possibly print status.
  198. //
  199. System.out.println(curURL + "\tDMOZCategory=" + curSection
  200. + "\ttitle=" + title + "\tdesc=" + desc);
  201. pages++;
  202.  
  203. //
  204. // Clear out the link text. This is what
  205. // you would use for adding to the linkdb.
  206. //
  207. if (title.length() > 0) {
  208. title.delete(0, title.length());
  209. }
  210. if (desc.length() > 0) {
  211. desc.delete(0, desc.length());
  212. }
  213.  
  214. // Null out the URL.
  215. curURL = null;
  216. } else if ("d:Title".equals(qName)) {
  217. titlePending = false;
  218. } else if ("d:Description".equals(qName)) {
  219. descPending = false;
  220. }
  221. }
  222. }
  223.  
  224. /**
  225. * When parsing begins
  226. */
  227. public void startDocument() {
  228.  
  229. }
  230.  
  231. /**
  232. * When parsing ends
  233. */
  234. public void endDocument() {
  235.  
  236. }
  237.  
  238. /**
  239. * From time to time the Parser will set the "current location" by
  240. * calling this function. It's useful for emitting locations for error
  241. * messages.
  242. */
  243. public void setDocumentLocator(Locator locator) {
  244. location = locator;
  245. }
  246.  
  247. //
  248. // Interface ErrorHandler
  249. //
  250.  
  251. /**
  252. * Emit the exception message
  253. */
  254. public void error(SAXParseException spe) {
  255.  
  256. }
  257.  
  258. /**
  259. * Emit the exception message, with line numbers
  260. */
  261. public void fatalError(SAXParseException spe) {
  262.  
  263. }
  264.  
  265. /**
  266. * Emit exception warning message
  267. */
  268. public void warning(SAXParseException spe) {
  269.  
  270. }
  271. }
  272.  
  273. /**
  274. * Iterate through all the items in this structured DMOZ file. Add each URL
  275. * to the web db.
  276. */
  277. public void parseDmozFile(File dmozFile, int subsetDenom, int skew,
  278. Pattern topicPattern)
  279. throws IOException, SAXException, ParserConfigurationException {
  280.  
  281. SAXParserFactory parserFactory = SAXParserFactory.newInstance();
  282. SAXParser parser = parserFactory.newSAXParser();
  283. XMLReader reader = parser.getXMLReader();
  284.  
  285. // Create our own processor to receive SAX events
  286. RDFProcessor rp = new RDFProcessor(reader, subsetDenom, skew,
  287. topicPattern);
  288. reader.setContentHandler(rp);
  289. reader.setErrorHandler(rp);
  290.  
  291. //
  292. // Open filtered text stream. The TextFilter makes sure that
  293. // only appropriate XML-approved Text characters are received.
  294. // Any non-conforming characters are silently skipped.
  295. //
  296. XMLCharFilter in = new XMLCharFilter(new BufferedReader(
  297. new InputStreamReader(new BufferedInputStream(
  298. new FileInputStream(dmozFile)), "UTF-8")));
  299. try {
  300. InputSource is = new InputSource(in);
  301. reader.parse(is);
  302. } catch (Exception e) {
  303. System.out.println(e);
  304. System.exit(0);
  305. } finally {
  306. in.close();
  307. }
  308. }
  309.  
  310. static void addTopicsFromFile(String topicFile, Vector<String> topics)
  311. throws IOException {
  312. BufferedReader in = null;
  313. try {
  314. in = new BufferedReader(new InputStreamReader(new FileInputStream(
  315. topicFile), "UTF-8"));
  316. String line = null;
  317. while ((line = in.readLine()) != null) {
  318. topics.addElement(new String(line));
  319. }
  320. } catch (Exception e) {
  321. System.exit(0);
  322. } finally {
  323. in.close();
  324. }
  325. }
  326.  
  327. /**
  328. * Command-line access. User may add URLs via a flat text file or the
  329. * structured DMOZ file. By default, we ignore Adult material (as
  330. * categorized by DMOZ).
  331. */
  332. public static void main(String argv[]) throws Exception {
  333. if (argv.length < 1) {
  334. System.err
  335. .println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
  336. return;
  337. }
  338.  
  339. //
  340. // Parse the command line, figure out what kind of
  341. // URL file we need to load
  342. //
  343. int subsetDenom = 1;
  344. int skew = 0;
  345. String dmozFile = argv[0];
  346. Pattern topicPattern = null;
  347. Vector<String> topics = new Vector<String>();
  348.  
  349. for (int i = 1; i < argv.length; i++) {
  350. if ("-subset".equals(argv[i])) {
  351. subsetDenom = Integer.parseInt(argv[i + 1]);
  352. i++;
  353. } else if ("-topic".equals(argv[i])) {
  354. topics.addElement(argv[i + 1]);
  355. i++;
  356. } else if ("-topicFile".equals(argv[i])) {
  357. addTopicsFromFile(argv[i + 1], topics);
  358. i++;
  359. } else if ("-skew".equals(argv[i])) {
  360. skew = Integer.parseInt(argv[i + 1]);
  361. i++;
  362. }
  363. }
  364.  
  365. DmozToFlat parser = new DmozToFlat();
  366.  
  367. if (!topics.isEmpty()) {
  368. String regExp = new String("^(");
  369. int j = 0;
  370. for (; j < topics.size() - 1; ++j) {
  371. regExp = regExp.concat(topics.get(j));
  372. regExp = regExp.concat("|");
  373. }
  374. regExp = regExp.concat(topics.get(j));
  375. regExp = regExp.concat(").*");
  376. topicPattern = Pattern.compile(regExp);
  377. }
  378.  
  379. parser.parseDmozFile(new File(dmozFile), subsetDenom, skew,
  380. topicPattern);
  381. }
  382.  
  383. }
  384.  
  385.