Code-Bin #957

El
Posted by: Dani | November 17, 2010 @ 7:30pm

Java 5 Code

package br.com.thor.tools.dmoz;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilterReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.xerces.util.XMLChar;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */
public class DmozToFlat {
	long pages = 0;

/**
	 * Reader filter that rewrites characters which might offend the XML
	 * parser, so that parsing can tolerate errors in the input XML.
	 *
	 * <p>Two repairs are applied to the character stream:
	 * <ul>
	 * <li>any character rejected by {@code XMLChar.isValid} is replaced by
	 * {@code 'X'};</li>
	 * <li>a {@code '<'} that directly follows U+FFFD and is not followed by
	 * {@code '/'} is replaced by {@code 'X'} (mis-matched bracket fix).</li>
	 * </ul>
	 *
	 * <p>The filter needs one character of look-ahead, so the wrapped reader
	 * must support {@code mark}/{@code reset}; a reader that does not is
	 * wrapped in a {@link BufferedReader}.
	 *
	 * NOTE(review): supplementary characters arrive as two surrogate chars and
	 * are checked one char at a time — confirm whether replacing each half is
	 * the intended behavior.
	 */
	private static class XMLCharFilter extends FilterReader {
		/** U+FFFD, the Unicode replacement character. */
		private static final int REPLACEMENT_CHAR = 65533;

		/** Character emitted in place of anything this filter rejects. */
		private static final char SUBSTITUTE = 'X';

		/** True when the character read just before the current one was U+FFFD. */
		private boolean lastBad = false;

		/**
		 * @param reader
		 *            source of characters; wrapped in a BufferedReader when it
		 *            does not support mark/reset
		 */
		public XMLCharFilter(Reader reader) {
			super(reader.markSupported() ? reader : new BufferedReader(reader));
		}

		/**
		 * Reads one character, applying the repairs described on the class.
		 *
		 * @return the (possibly substituted) character, or -1 at end of stream
		 */
		@Override
		public int read() throws IOException {
			int c = in.read();
			int value = c;
			if (c != -1 && !XMLChar.isValid(c)) { // fix invalid characters
				value = SUBSTITUTE;
			} else if (lastBad && c == '<' && !nextIsSlash()) {
				// fix mis-matched brackets
				value = SUBSTITUTE;
			}
			lastBad = (c == REPLACEMENT_CHAR);
			return value;
		}

		/**
		 * Reads up to {@code len} characters into {@code cbuf} and repairs them
		 * in place.
		 *
		 * @return the number of characters read, or -1 at end of stream
		 */
		@Override
		public int read(char[] cbuf, int off, int len) throws IOException {
			int n = in.read(cbuf, off, len);
			for (int i = 0; i < n; i++) {
				char c = cbuf[off + i];
				char value = c;
				if (!XMLChar.isValid(c)) { // fix invalid characters
					value = SUBSTITUTE;
				} else if (lastBad && c == '<') { // fix mis-matched brackets
					// The following character is still unfiltered in cbuf; at
					// the end of the chunk it has to be peeked from the
					// underlying reader instead, so the check is not skipped
					// on a chunk boundary.
					boolean closesTag = (i < n - 1)
							? cbuf[off + i + 1] == '/'
							: nextIsSlash();
					if (!closesTag) {
						value = SUBSTITUTE;
					}
				}
				lastBad = (c == REPLACEMENT_CHAR);
				cbuf[off + i] = value;
			}
			return n;
		}

		/**
		 * Peeks at the next character of the underlying reader without
		 * consuming it.
		 *
		 * @return true when the next character is {@code '/'}; false for any
		 *         other character or end of stream
		 */
		private boolean nextIsSlash() throws IOException {
			in.mark(1);
			try {
				return in.read() == '/';
			} finally {
				in.reset();
			}
		}
	}

/**
	 * The RDFProcessor receives tag messages during a SAX parse of the DMOZ
	 * RDF XML data and prints one tab-separated line per accepted page to
	 * standard output.
	 *
	 * <p>Behavior as implemented:
	 * <ul>
	 * <li>each {@code Topic} element sets the current section to
	 * {@code firstLevel/secondLevel} of its {@code r:id} (second level
	 * defaults to {@code "Others"});</li>
	 * <li>{@code ExternalPage} elements are only considered while the current
	 * section starts with {@code "Sports"}, and of those every second one is
	 * skipped;</li>
	 * <li>the {@code subsetDenom}, skew and {@code topicPattern} values are
	 * stored but not consulted by this handler.</li>
	 * </ul>
	 */
	private class RDFProcessor extends DefaultHandler {
		String curURL = null, curSection = null;
		boolean titlePending = false, descPending = false,
				insideAdultSection = false;
		Pattern topicPattern = null;
		StringBuffer title = new StringBuffer(), desc = new StringBuffer();
		XMLReader reader;
		int subsetDenom;
		int hashSkew;
		Locator location;
		/** When true, the next candidate page is dropped (every-other-page sampling). */
		private boolean skipNextPage;

		/**
		 * Creates a handler for one parse.
		 *
		 * @param reader
		 *            the XMLReader driving the parse (stored only)
		 * @param subsetDenom
		 *            subset denominator (stored only)
		 * @param skew
		 *            hash skew (stored only)
		 * @param topicPattern
		 *            topic filter pattern, may be null (stored only)
		 */
		public RDFProcessor(XMLReader reader, int subsetDenom, int skew,
				Pattern topicPattern) throws IOException {
			this.reader = reader;
			this.subsetDenom = subsetDenom;
			this.hashSkew = skew;
			this.topicPattern = topicPattern;
		}

		//
		// Interface ContentHandler
		//

		/**
		 * Start of an XML elt. Tracks the current section, claims the URL of
		 * accepted pages and arms title/description capture.
		 */
		@Override
		public void startElement(String namespaceURI, String localName,
				String qName, Attributes atts) throws SAXException {
			if ("Topic".equals(qName)) {
				String section = toSection(atts.getValue("r:id"));
				if (section != null) {
					curSection = section;
				}
			} else if ("ExternalPage".equals(qName)) {
				// A page seen before any Topic has no section and is ignored.
				if (curSection == null || !curSection.startsWith("Sports")) {
					return;
				}

				// Sampling: of the pages in the selected section only every
				// second one is emitted.
				if (skipNextPage) {
					skipNextPage = false;
					return;
				}

				// We actually claim the URL!
				curURL = atts.getValue("about");
				skipNextPage = true;
			} else if (curURL != null && "d:Title".equals(qName)) {
				titlePending = true;
			} else if (curURL != null && "d:Description".equals(qName)) {
				descPending = true;
			}
		}

		/**
		 * Reduces a Topic id such as {@code Top/Sports/Soccer/Clubs} to its
		 * first two levels ({@code Sports/Soccer}).
		 *
		 * @param category
		 *            value of the {@code r:id} attribute, may be null
		 * @return {@code first/second}, with {@code "Others"} standing in for
		 *         a missing second level, or null when no first level exists
		 */
		private String toSection(String category) {
			if (category == null) {
				return null;
			}
			// Strip only the leading root marker; an interior "Top/" belongs
			// to a category name and must be left alone.
			if (category.startsWith("Top/")) {
				category = category.substring("Top/".length());
			}
			if (category.trim().length() == 0) {
				return null;
			}
			StringTokenizer levels = new StringTokenizer(category, "/");
			if (!levels.hasMoreTokens()) {
				return null;
			}
			String firstLevel = levels.nextToken();
			String secondLevel = levels.hasMoreTokens() ? levels.nextToken()
					: "Others";
			return firstLevel + "/" + secondLevel;
		}

		/**
		 * The contents of an XML elt. A parser may deliver one text node in
		 * several calls, so every chunk is appended, not only the first.
		 */
		@Override
		public void characters(char ch[], int start, int length) {
			if (titlePending) {
				appendFlattened(title, ch, start, length);
			} else if (descPending) {
				appendFlattened(desc, ch, start, length);
			}
		}

		/**
		 * Appends a text chunk, turning newlines and tabs into spaces so the
		 * value cannot break the tab-separated, one-line-per-page output.
		 */
		private void appendFlattened(StringBuffer target, char[] ch, int start,
				int length) {
			int end = start + length;
			for (int i = start; i < end; i++) {
				char c = ch[i];
				target.append(c == '\n' || c == '\t' ? ' ' : c);
			}
		}

		/**
		 * Termination of XML elt. On the end of a claimed ExternalPage the
		 * collected record is printed and the per-page state is reset.
		 */
		@Override
		public void endElement(String namespaceURI, String localName,
				String qName) throws SAXException {
			if (curURL == null) {
				return;
			}
			if ("ExternalPage".equals(qName)) {
				// Print the page and count it.
				System.out.println(curURL + "\tDMOZCategory=" + curSection
						+ "\ttitle=" + title + "\tdesc=" + desc);
				pages++;

				// Clear out the collected text and release the URL so the
				// next page starts from a clean state.
				title.setLength(0);
				desc.setLength(0);
				curURL = null;
			} else if ("d:Title".equals(qName)) {
				titlePending = false;
			} else if ("d:Description".equals(qName)) {
				descPending = false;
			}
		}

		/**
		 * When parsing begins. Nothing to do.
		 */
		@Override
		public void startDocument() {
		}

		/**
		 * When parsing ends. Nothing to do.
		 */
		@Override
		public void endDocument() {
		}

		/**
		 * From time to time the Parser will set the "current location" by
		 * calling this function. The locator is kept for reference.
		 */
		@Override
		public void setDocumentLocator(Locator locator) {
			location = locator;
		}

		//
		// Interface ErrorHandler
		//

		/**
		 * Emit the exception message
		 */
		@Override
		public void error(SAXParseException spe) {
			report("Error", spe);
		}

		/**
		 * Emit the exception message, with line numbers. The handler itself
		 * does not rethrow.
		 * NOTE(review): the parser is expected to abort on its own after a
		 * fatal error — confirm for the parser in use.
		 */
		@Override
		public void fatalError(SAXParseException spe) {
			report("Fatal error", spe);
		}

		/**
		 * Emit exception warning message
		 */
		@Override
		public void warning(SAXParseException spe) {
			report("Warning", spe);
		}

		/**
		 * Writes a parse problem to standard error; standard output is
		 * reserved for the generated records.
		 */
		private void report(String severity, SAXParseException spe) {
			System.err.println(severity + ": " + spe + " (line "
					+ spe.getLineNumber() + ", column "
					+ spe.getColumnNumber() + ")");
		}
	}

/**
	 * Iterates through all the items in the given structured DMOZ file. The
	 * file is decoded as UTF-8, passed through an {@link XMLCharFilter} and
	 * parsed with SAX; an {@link RDFProcessor} receives the events.
	 *
	 * <p>Failures are reported to the caller as exceptions; this method never
	 * terminates the JVM itself.
	 *
	 * @param dmozFile
	 *            the DMOZ RDF file to read
	 * @param subsetDenom
	 *            subset denominator, forwarded to the RDFProcessor
	 * @param skew
	 *            skew, forwarded to the RDFProcessor
	 * @param topicPattern
	 *            topic pattern (may be null), forwarded to the RDFProcessor
	 * @throws IOException
	 *             if the file cannot be opened or read
	 * @throws SAXException
	 *             if the parser reports an error it cannot recover from
	 * @throws ParserConfigurationException
	 *             if no SAX parser can be created
	 */
	public void parseDmozFile(File dmozFile, int subsetDenom, int skew,
			Pattern topicPattern)
	throws IOException, SAXException, ParserConfigurationException {

		SAXParserFactory parserFactory = SAXParserFactory.newInstance();
		SAXParser parser = parserFactory.newSAXParser();
		XMLReader reader = parser.getXMLReader();

		// Create our own processor to receive SAX events
		RDFProcessor rp = new RDFProcessor(reader, subsetDenom, skew,
				topicPattern);
		reader.setContentHandler(rp);
		reader.setErrorHandler(rp);

		//
		// Open filtered text stream. The XMLCharFilter makes sure that
		// only XML-approved characters reach the parser; non-conforming
		// characters are replaced rather than dropped.
		//
		// The file stream is opened first and closed in the finally block so
		// that it is released even when building the reader chain fails.
		FileInputStream fileIn = new FileInputStream(dmozFile);
		try {
			XMLCharFilter in = new XMLCharFilter(new BufferedReader(
					new InputStreamReader(new BufferedInputStream(fileIn),
							"UTF-8")));
			reader.parse(new InputSource(in));
		} finally {
			fileIn.close();
		}
	}

/**
	 * Reads a UTF-8 text file line by line and appends every line, unchanged,
	 * to {@code topics}.
	 *
	 * @param topicFile
	 *            path of the file holding one topic per line
	 * @param topics
	 *            list that receives the lines, in file order
	 * @throws IOException
	 *             if the file cannot be opened or read; the caller decides
	 *             how to react instead of the JVM being shut down here
	 */
	static void addTopicsFromFile(String topicFile, Vector<String> topics)
			throws IOException {
		// Opening happens outside the try block: when it fails there is
		// nothing to close yet.
		BufferedReader in = new BufferedReader(new InputStreamReader(
				new FileInputStream(topicFile), "UTF-8"));
		try {
			String line;
			while ((line = in.readLine()) != null) {
				topics.addElement(line);
			}
		} finally {
			in.close();
		}
	}

/**
	 * Command-line access. The first argument is the structured DMOZ file;
	 * the remaining arguments are options:
	 * <ul>
	 * <li>{@code -subset <n>}: subset denominator (default 1)</li>
	 * <li>{@code -skew <n>}: skew (default 0)</li>
	 * <li>{@code -topic <topic>}: adds one topic, may be repeated</li>
	 * <li>{@code -topicFile <file>}: adds one topic per line of the file</li>
	 * </ul>
	 * All collected topics are combined into one prefix pattern. The values
	 * are forwarded to {@link #parseDmozFile}. Unrecognized arguments are
	 * ignored.
	 *
	 * @throws IllegalArgumentException
	 *             if an option is given without its value
	 */
	public static void main(String argv[]) throws Exception {
		if (argv.length < 1) {
			System.err
					.println("Usage: DmozToFlat <dmoz_file> [-subset <subsetDenominator>] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
			return;
		}

		//
		// Parse the command line, figure out what kind of
		// URL file we need to load
		//
		int subsetDenom = 1;
		int skew = 0;
		String dmozFile = argv[0];
		Vector<String> topics = new Vector<String>();

		for (int i = 1; i < argv.length; i++) {
			String option = argv[i];
			if ("-subset".equals(option)) {
				subsetDenom = Integer.parseInt(optionValue(argv, i));
				i++;
			} else if ("-topic".equals(option)) {
				topics.addElement(optionValue(argv, i));
				i++;
			} else if ("-topicFile".equals(option)) {
				addTopicsFromFile(optionValue(argv, i), topics);
				i++;
			} else if ("-skew".equals(option)) {
				skew = Integer.parseInt(optionValue(argv, i));
				i++;
			}
		}

		Pattern topicPattern = topics.isEmpty() ? null
				: buildTopicPattern(topics);

		DmozToFlat parser = new DmozToFlat();
		parser.parseDmozFile(new File(dmozFile), subsetDenom, skew,
				topicPattern);
	}

	/**
	 * Returns the value that follows the option at {@code argv[index]}.
	 *
	 * @throws IllegalArgumentException
	 *             if the option is the last argument and has no value
	 */
	private static String optionValue(String[] argv, int index) {
		if (index + 1 >= argv.length) {
			throw new IllegalArgumentException("Missing value for option "
					+ argv[index]);
		}
		return argv[index + 1];
	}

	/**
	 * Builds the pattern {@code ^(t1|t2|...).*} from a non-empty topic list.
	 * Topics are inserted verbatim, so they are interpreted as regular
	 * expressions.
	 */
	private static Pattern buildTopicPattern(Vector<String> topics) {
		StringBuilder regExp = new StringBuilder("^(");
		for (int j = 0; j < topics.size(); j++) {
			if (j > 0) {
				regExp.append('|');
			}
			regExp.append(topics.get(j));
		}
		regExp.append(").*");
		return Pattern.compile(regExp.toString());
	}

}

Syntax Highlighting

[Open in new window]

Author Comments

none

Rating

4.39 / 8
36 Votes