All pastes #2124650 Raw Edit

Unnamed

public text v1 · immutable
#2124650 ·published 2012-03-05 17:49 UTC
rendered paste body
import java.io.*;

/**
 * Enkel tolk f�r datat till Kevin Bacon-uppgiften. Inte p� l�nga v�gar testad
 * tillr�ckligt bra, s� rapportera alla buggar ni hittar.
 * 
 * Anv�ndning �r enkel:
 * <ul>
 * <li>Skapa en instans av BaconReader och skicka med s�kv�gen till filen ni
 * vill l�sa in.
 * <li>Anropa metoden getNextPart tills ni har f�tt in allt ni vill ha in, ni
 * tr�ttnar, eller den returerar null. Det sistn�mnda betyder att filen �r slut.
 * <li>Anropa close f�r att st�nga filen n�r allt ni vill l�sa in �r klart.
 * </ul>
 * 
 * Normalt skulle jag antagligen anv�nt ANTLR f�r den h�r typen av jobb, men
 * ville inte anv�nda externa bibliotek h�r. Regexpar funkar ocks�, men jag
 * ville ha den h�r l�sningsmetoden f�r att kunna anv�nda som exempel.
 * 
 * Till slut ett tips: jobba inte med de fulla filerna n�r ni testar. Det �r
 * drygt 16 miljoner rader i de b�gge filerna actors.list och actresses.list.
 * Kommandona (linux edyl) head och tail kan vara till nytta f�r att skapa mer
 * hanterliga datam�ngder att testa med.
 * 
 * @author henrikbe
 */
public class BaconReader {

	public enum PartType {
		/**
		 * Name of actor or actress
		 */
		NAME,
		/**
		 * Title of movie or show
		 */
		TITLE,
		/**
		 * Year, remember to combine this with the title to get a reasonable ID
		 */
		YEAR,
		/**
		 * Id of a recurring show, might be a date, a name, or any other sort of
		 * ID. The most common one is probably on the format (#1.12) which means
		 * season 1 show 12. You <b>need</b> to use this value to distinguise
		 * between tv shows, otherwise the Bacon number of almost everyone will
		 * be far too low.
		 */
		ID,
		/**
		 * Any extra sort of information. Ignore this.
		 */
		INFO
	}

	public static class Part {
		public PartType type;
		public String text;

		public Part(PartType type, String text) {
			this.type = type;
			this.text = text;
		}

		public String toString() {
			return String.format("[%s, %s]", type, text);
		}
	}

	private boolean atBeginningOfLine;
	private BufferedReader reader;
	private int currentChar;
	private StringBuffer buffer;

	public BaconReader(String file) throws FileNotFoundException, IOException {
		this(new File(file));
	}

	public BaconReader(File file) throws FileNotFoundException, IOException {
		reader = new BufferedReader(new FileReader(file));
		skipToBeginningOfData();
		buffer = new StringBuffer();
	}

	public void close() throws IOException {
		reader.close();
	}

	public Part getNextPart() throws IOException {
		ignoreWhiteSpace();
		switch (currentChar) {
		case -1:
			return null;
		case '(':
			return yearOrExtraInfo();
		case '"':
			return tvTitle();
		case '<':
			return billingPosition();
		case '[':
			return characterName();
		case '{':
			return showId();
		default:
			return nameOrTitle();
		}
	}

	// ////////////////////////////////////////////////////////////////////////
	/*
	 * Alla metoder nedanf�r �r privata och anv�nds f�r att tolka filen. Om du
	 * inte �r intresserad av hur klassen jobbar (enkel recurisve descent) s�
	 * kan allt nedanf�r ignoreras.
	 */
	// ////////////////////////////////////////////////////////////////////////

	private void skipToBeginningOfData() throws IOException {
		String line = reader.readLine();
		while (!line.startsWith("----			------")) {
			line = reader.readLine();
		}
		read();
		atBeginningOfLine = true;
	}

	private void accept() throws IOException {
		buffer.append((char) currentChar);
		read();
	}

	private void accept(char ch) throws IOException {
		if (currentChar == ch) {
			accept();
		} else {
			throw new RuntimeException(
					String
							.format(
									"Illegal character found, was %s (%d), but should have been %s (%d)",
									(char) currentChar, currentChar, ch,
									(int) ch));
		}
	}

	private void acceptTo(char ch) throws IOException {
		while (currentChar != ch) {
			accept();
		}
	}

	private void skip() throws IOException {
		read();
	}

	private void read() throws IOException {
		atBeginningOfLine = currentChar == 10 || currentChar == 13;
		currentChar = reader.read();
	}

	private void ignoreWhiteSpace() throws IOException {
		while (Character.isWhitespace(currentChar)) {
			read();
		}
	}

	private Part getPart(PartType type) {
		Part p = new Part(type, buffer.toString().trim());
		buffer = new StringBuffer();
		return p;
	}

	private Part yearOrExtraInfo() throws IOException {
		skip();
		while (Character.isDigit(currentChar)) {
			accept();
		}
		if (currentChar == ')') {
			skip();
			return getPart(PartType.YEAR);
		}
		acceptTo(')');
		skip();
		return getPart(PartType.INFO);
	}

	private Part tvTitle() throws IOException {
		skip();
		acceptTo('"');
		skip();
		return getPart(PartType.TITLE);
	}

	private Part billingPosition() throws IOException {
		skip();
		acceptTo('>');
		skip();
		return getPart(PartType.INFO);
	}

	private Part characterName() throws IOException {
		skip();
		acceptTo(']');
		skip();
		return getPart(PartType.INFO);
	}

	private Part showId() throws IOException {
		skip();
		acceptTo('}');
		skip();
		return getPart(PartType.ID);
	}

	private Part nameOrTitle() throws IOException {
		if (atBeginningOfLine)
			return name();
		else
			return title();
	}

	private Part name() throws IOException {
		while (currentChar == '-') {
			accept();
		}
		if (buffer.length() > 1) {
			currentChar = -1;
			return null;
		}
		acceptTo('\t');
		return getPart(PartType.NAME);
	}

	private Part title() throws IOException {
		acceptTo('(');
		return getPart(PartType.TITLE);
	}

}