From 1dba2aab97bbc4ada2e23b7205882c58afd7825a Mon Sep 17 00:00:00 2001
From: haschart
Date: Fri, 26 Sep 2008 21:17:42 +0000
Subject: [PATCH] Issue number: Changes for 2.4 release Submitted by: Robert
Haschart
---
build.properties | 2 +-
build.xml | 18 +-
changes.txt | 27 +-
src/org/marc4j/ErrorHandler.java | 205 +++
src/org/marc4j/MarcDirStreamReader.java | 145 ++
.../marc4j/MarcPermissiveStreamReader.java | 1523 +++++++++++++++++
src/org/marc4j/MarcStreamReader.java | 311 ++--
src/org/marc4j/MarcXmlParserThread.java | 11 +-
src/org/marc4j/RecordStack.java | 20 +-
src/org/marc4j/converter/CharConverter.java | 27 +-
.../marc4j/converter/impl/AnselToUnicode.java | 375 +++-
src/org/marc4j/converter/impl/CodeTable.java | 168 +-
.../converter/impl/CodeTableHandler.java | 26 +-
.../converter/impl/CodeTableInterface.java | 7 +
.../converter/impl/Iso5426ToUnicode.java | 9 +-
.../converter/impl/Iso6937ToUnicode.java | 9 +-
.../marc4j/converter/impl/UnicodeToAnsel.java | 9 +-
.../converter/impl/UnicodeToIso5426.java | 9 +-
.../converter/impl/UnicodeToIso6937.java | 9 +-
src/org/marc4j/marc/impl/RecordImpl.java | 8 +-
src/org/marc4j/marc/impl/Verifier.java | 16 +-
.../samples/HandleExceptionExample.java | 57 +
.../samples/PermissiveReaderExample.java | 229 +++
.../marc4j/samples/resources/diacritic4.mrc | 1 +
src/org/marc4j/samples/resources/error.mrc | 1 +
25 files changed, 2957 insertions(+), 265 deletions(-)
create mode 100644 src/org/marc4j/ErrorHandler.java
create mode 100644 src/org/marc4j/MarcDirStreamReader.java
create mode 100644 src/org/marc4j/MarcPermissiveStreamReader.java
create mode 100644 src/org/marc4j/converter/impl/CodeTableInterface.java
create mode 100644 src/org/marc4j/samples/HandleExceptionExample.java
create mode 100644 src/org/marc4j/samples/PermissiveReaderExample.java
create mode 100644 src/org/marc4j/samples/resources/diacritic4.mrc
create mode 100644 src/org/marc4j/samples/resources/error.mrc
diff --git a/build.properties b/build.properties
index 5d6f38bf..2c4c6096 100644
--- a/build.properties
+++ b/build.properties
@@ -3,5 +3,5 @@ src.dir=src
build.dir=build
dist.dir=dist
apidoc.dir=apidoc
-version=2.3.2
+version=2.4
project.name=marc4j
\ No newline at end of file
diff --git a/build.xml b/build.xml
index c619e3fc..22995475 100644
--- a/build.xml
+++ b/build.xml
@@ -27,10 +27,22 @@
+
+
+
+
-
-
-
+
+
+
+
+
+
+
+
+
+
+
diff --git a/changes.txt b/changes.txt
index 1f567c7b..9a672dd3 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,9 +1,34 @@
+Changes to MARC4J 2.4
+
+MARC4J 2.4 is a minor release providing some bug fixes and some new functionality.
+
+- Added MarcPermissiveStreamReader which is more capable of reading records that contain structural or
+ encoding errors, and is capable of translating the records to UTF-8 as they are read.
+- Added ErrorHandler which is used for tracking and reporting structural or encoding errors
+ encountered by the MarcPermissiveStreamReader.
+- Added MarcDirStreamReader which iterates over all of the MARC record files in a given directory.
+- Modified MarcStreamReader so that if an exception is thrown for an error in one record you can
+ choose to catch the exception, discard the erroneous record and continue reading from the input file.
+- Modified AnselToUnicode to fix some problems that would occur when trying to handle Chinese characters,
+ to fix an infinite loop problem that would occur sometimes when extraneous characters appear within a
+ MARC8 character set escape sequence, and made many changes to support the MarcPermissiveStreamReader
+ to report and try to recover from encoding errors in the records being read.
+- Modified CodeTable (which is used by AnselToUnicode) so that rather than reading and parsing a large
+ XML file to create the hash tables for mapping MARC8 to Unicode at runtime, the parsing is done once
+ at compile time, and a class that handles the mapping directly via switch statements is automatically
+ generated.
+- Made minor changes to the MarcXmlReader so that if an exception occurs in the MarcXmlParserThread that
+ it starts, the exception is passed to the MarcXmlReader rather than simply hanging the parser thread.
+- Added PermissiveReaderExample which demonstrates how to use the MarcPermissiveStreamReader to examine and/or
+ validate records for structural or encoding errors.
+
+
Changes to MARC4J 2.3.1
MARC4J 2.3.1 is a minor release with some encoding fixes
- Fixed encoding bug in MarcStreamReader: now sets ISO8859_1 as default as alternative for MARC-8 and
- UNIMARC encoding alternative. For MARC 21 the ledare is checked: space is ISO 8859_1 and a is UTF-8.
+ UNIMARC encoding alternative. For MARC 21 the leader is checked: space is ISO 8859_1 and a is UTF-8.
When an encoding is provided in the MarcStreamReader constructor, this encoding overrides
the default encoding and the leader encoding value.
- MarcXmlDriver: when converting from MARC-8 to UTF-8 character coding scheme in leader (pos. 9) is set to 'a'.
diff --git a/src/org/marc4j/ErrorHandler.java b/src/org/marc4j/ErrorHandler.java
new file mode 100644
index 00000000..42041075
--- /dev/null
+++ b/src/org/marc4j/ErrorHandler.java
@@ -0,0 +1,205 @@
+// $Id: ErrorHandler.java,v 1.6 2008/09/26 21:17:42 haschart Exp $
+/**
+ * Copyright (C) 2004 Bas Peters
+ *
+ * This file is part of MARC4J
+ *
+ * MARC4J is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * MARC4J is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with MARC4J; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+package org.marc4j;
+
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Defines and describes errors encountered in the processing of a given MARC record.
+ * Used in conjunction with the MarcPermissiveStreamReader class.
+ *
+ * @author Robert Haschart
+ * @version $Revision: 1.6 $
+ */
+public class ErrorHandler {
+
+ public final static int FATAL = 5;
+ public final static int MAJOR_ERROR = 4;
+ public final static int MINOR_ERROR = 3;
+ public final static int ERROR_TYPO = 2;
+ public final static int WARNING = 1;
+ public final static int INFO = 0;
+
+ private List
*
* @author Bas Peters
- * @version $Revision: 1.10 $
+ * @version $Revision: 1.11 $
*
*/
public class MarcStreamReader implements MarcReader {
- private InputStream input = null;
+ private DataInputStream input = null;
private Record record;
@@ -75,8 +84,8 @@ public class MarcStreamReader implements MarcReader {
private String encoding = "ISO8859_1";
private boolean override = false;
-
- private boolean hasNext = true;
+
+ private CharConverter converterAnsel = null;
/**
* Constructs an instance with the specified input stream.
@@ -86,11 +95,10 @@ public MarcStreamReader(InputStream input) {
}
/**
- * Constructs an instance with the specified input stream and character
- * encoding.
+ * Constructs an instance with the specified input stream and character encoding.
*/
public MarcStreamReader(InputStream input, String encoding) {
- this.input = input;
+ this.input = new DataInputStream(new BufferedInputStream(input));
factory = MarcFactory.newInstance();
if (encoding != null) {
this.encoding = encoding;
@@ -116,117 +124,121 @@ public boolean hasNext() {
*
* @return Record - the record object
*/
- public Record next() {
- Leader ldr;
- int bytesRead = 0;
-
+ public Record next()
+ {
record = factory.newRecord();
try {
byte[] byteArray = new byte[24];
- bytesRead = input.read(byteArray);
-
- if (bytesRead == -1)
- throw new MarcException("no data to read");
-
- while (bytesRead != -1 && bytesRead != byteArray.length)
- bytesRead += input.read(byteArray, bytesRead, byteArray.length
- - bytesRead);
-
- try {
- ldr = parseLeader(byteArray);
- } catch (IOException e) {
- throw new MarcException("error parsing leader with data: "
- + new String(byteArray), e);
- }
-
- // if MARC 21 then check encoding
- switch (ldr.getCharCodingScheme()) {
- case ' ':
- if (!override)
- encoding = "ISO8859_1";
- break;
- case 'a':
- if (!override)
- encoding = "UTF8";
- }
-
- record.setLeader(ldr);
+ input.readFully(byteArray);
- int directoryLength = ldr.getBaseAddressOfData() - (24 + 1);
- if ((directoryLength % 12) != 0)
- throw new MarcException("invalid directory");
- int size = directoryLength / 12;
-
- String[] tags = new String[size];
- int[] lengths = new int[size];
+ int recordLength = parseRecordLength(byteArray);
+ byte[] recordBuf = new byte[recordLength - 24];
+ input.readFully(recordBuf);
+ parseRecord(record, byteArray, recordBuf, recordLength);
+ return(record);
+ }
+ catch (EOFException e) {
+ throw new MarcException("Premature end of file encountered", e);
+ }
+ catch (IOException e) {
+ throw new MarcException("an error occured reading input", e);
+ }
+ }
+
+ private void parseRecord(Record record, byte[] byteArray, byte[] recordBuf, int recordLength)
+ {
+ Leader ldr;
+ ldr = factory.newLeader();
+ ldr.setRecordLength(recordLength);
+ int directoryLength=0;
+
+ try {
+ parseLeader(ldr, byteArray);
+ directoryLength = ldr.getBaseAddressOfData() - (24 + 1);
+ }
+ catch (IOException e) {
+ throw new MarcException("error parsing leader with data: "
+ + new String(byteArray), e);
+ }
+ catch (MarcException e) {
+ throw new MarcException("error parsing leader with data: "
+ + new String(byteArray), e);
+ }
- byte[] tag = new byte[3];
- byte[] length = new byte[4];
- byte[] start = new byte[5];
+ // if MARC 21 then check encoding
+ switch (ldr.getCharCodingScheme()) {
+ case ' ':
+ if (!override)
+ encoding = "ISO-8859-1";
+ break;
+ case 'a':
+ if (!override)
+ encoding = "UTF8";
+ }
+ record.setLeader(ldr);
+
+ if ((directoryLength % 12) != 0)
+ {
+ throw new MarcException("invalid directory");
+ }
+ DataInputStream inputrec = new DataInputStream(new ByteArrayInputStream(recordBuf));
+ int size = directoryLength / 12;
- String tmp;
+ String[] tags = new String[size];
+ int[] lengths = new int[size];
- for (int i = 0; i < size; i++) {
- bytesRead = input.read(tag);
+ byte[] tag = new byte[3];
+ byte[] length = new byte[4];
+ byte[] start = new byte[5];
- while (bytesRead != -1 && bytesRead != tag.length)
- bytesRead += input.read(tag, bytesRead, tag.length
- - bytesRead);
+ String tmp;
+ try {
+ for (int i = 0; i < size; i++)
+ {
+ inputrec.readFully(tag);
tmp = new String(tag);
tags[i] = tmp;
-
- bytesRead = input.read(length);
-
- while (bytesRead != -1 && bytesRead != length.length)
- bytesRead += input.read(length, bytesRead, length.length
- - bytesRead);
-
+
+ inputrec.readFully(length);
tmp = new String(length);
lengths[i] = Integer.parseInt(tmp);
-
- bytesRead = input.read(start);
-
- while (bytesRead != -1 && bytesRead != start.length)
- bytesRead += input.read(start, bytesRead, start.length
- - bytesRead);
+
+ inputrec.readFully(start);
}
-
- if (input.read() != Constants.FT)
- throw new MarcException(
- "expected field terminator at end of directory");
-
- for (int i = 0; i < size; i++) {
- if (Verifier.isControlField(tags[i])) {
+
+ if (inputrec.read() != Constants.FT)
+ {
+ throw new MarcException("expected field terminator at end of directory");
+ }
+
+ for (int i = 0; i < size; i++)
+ {
+ int fieldLength = getFieldLength(inputrec);
+ if (Verifier.isControlField(tags[i]))
+ {
byteArray = new byte[lengths[i] - 1];
- bytesRead = input.read(byteArray);
-
- while (bytesRead != -1 && bytesRead != byteArray.length)
- bytesRead += input.read(byteArray, bytesRead,
- byteArray.length - bytesRead);
-
- if (input.read() != Constants.FT)
- throw new MarcException(
- "expected field terminator at end of field");
-
+ inputrec.readFully(byteArray);
+
+ if (inputrec.read() != Constants.FT)
+ {
+ throw new MarcException("expected field terminator at end of field");
+ }
+
ControlField field = factory.newControlField();
field.setTag(tags[i]);
field.setData(getDataAsString(byteArray));
record.addVariableField(field);
-
- } else {
+ }
+ else
+ {
byteArray = new byte[lengths[i]];
- bytesRead = input.read(byteArray);
-
- while (bytesRead != -1 && bytesRead != byteArray.length)
- bytesRead += input.read(byteArray, bytesRead,
- byteArray.length - bytesRead);
-
+ inputrec.readFully(byteArray);
+
try {
- record.addVariableField(parseDataField(tags[i],
- byteArray));
+ record.addVariableField(parseDataField(tags[i], byteArray));
} catch (IOException e) {
throw new MarcException(
"error parsing data field for tag: " + tags[i]
@@ -235,14 +247,16 @@ record = factory.newRecord();
}
}
}
-
- if (input.read() != Constants.RT)
+
+ if (inputrec.read() != Constants.RT)
+ {
throw new MarcException("expected record terminator");
-
- } catch (IOException e) {
- throw new MarcException("an error occured reading input", e);
+ }
+ }
+ catch (IOException e)
+ {
+ throw new MarcException("an error occured reading input", e);
}
- return record;
}
private DataField parseDataField(String tag, byte[] field)
@@ -286,6 +300,25 @@ private DataField parseDataField(String tag, byte[] field)
}
return dataField;
}
+
+ private int getFieldLength(DataInputStream bais) throws IOException
+ {
+ bais.mark(9999);
+ int bytesRead = 0;
+ while (true) {
+ switch (bais.read()) {
+ case Constants.FT:
+ bais.reset();
+ return bytesRead;
+ case -1:
+ bais.reset();
+ throw new IOException("Field not terminated");
+ case Constants.US:
+ default:
+ bytesRead++;
+ }
+ }
+ }
private int getSubfieldLength(ByteArrayInputStream bais) throws IOException {
bais.mark(9999);
@@ -305,63 +338,89 @@ private int getSubfieldLength(ByteArrayInputStream bais) throws IOException {
}
}
- private Leader parseLeader(byte[] leaderData) throws IOException {
+ private int parseRecordLength(byte[] leaderData) throws IOException {
InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
leaderData));
- Leader ldr = factory.newLeader();
+ int length = -1;
char[] tmp = new char[5];
isr.read(tmp);
try {
- ldr.setRecordLength(Integer.parseInt(new String(tmp)));
+ length = Integer.parseInt(new String(tmp));
} catch (NumberFormatException e) {
throw new MarcException("unable to parse record length", e);
}
+ return(length);
+ }
+
+ private void parseLeader(Leader ldr, byte[] leaderData) throws IOException {
+ InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(
+ leaderData));
+ char[] tmp = new char[5];
+ isr.read(tmp);
+ // Skip over the bytes for the record length; if we get here, it has already been computed.
ldr.setRecordStatus((char) isr.read());
ldr.setTypeOfRecord((char) isr.read());
tmp = new char[2];
isr.read(tmp);
ldr.setImplDefined1(tmp);
ldr.setCharCodingScheme((char) isr.read());
+ char indicatorCount = (char) isr.read();
+ char subfieldCodeLength = (char) isr.read();
+ char baseAddr[] = new char[5];
+ isr.read(baseAddr);
+ tmp = new char[3];
+ isr.read(tmp);
+ ldr.setImplDefined2(tmp);
+ tmp = new char[4];
+ isr.read(tmp);
+ ldr.setEntryMap(tmp);
+ isr.close();
try {
- ldr.setIndicatorCount(Integer.parseInt(String.valueOf((char) isr
- .read())));
+ ldr.setIndicatorCount(Integer.parseInt(String.valueOf(indicatorCount)));
} catch (NumberFormatException e) {
throw new MarcException("unable to parse indicator count", e);
}
try {
ldr.setSubfieldCodeLength(Integer.parseInt(String
- .valueOf((char) isr.read())));
+ .valueOf(subfieldCodeLength)));
} catch (NumberFormatException e) {
throw new MarcException("unable to parse subfield code length", e);
}
- tmp = new char[5];
- isr.read(tmp);
try {
- ldr.setBaseAddressOfData(Integer.parseInt(new String(tmp)));
+ ldr.setBaseAddressOfData(Integer.parseInt(new String(baseAddr)));
} catch (NumberFormatException e) {
throw new MarcException("unable to parse base address of data", e);
}
- tmp = new char[3];
- isr.read(tmp);
- ldr.setImplDefined2(tmp);
- tmp = new char[4];
- isr.read(tmp);
- ldr.setEntryMap(tmp);
- isr.close();
- return ldr;
+
}
- private String getDataAsString(byte[] bytes) {
+ private String getDataAsString(byte[] bytes)
+ {
String dataElement = null;
- if (encoding != null)
+ if (encoding.equals("UTF-8") || encoding.equals("UTF8"))
+ {
+ try {
+ dataElement = new String(bytes, "UTF8");
+ }
+ catch (UnsupportedEncodingException e) {
+ throw new MarcException("unsupported encoding", e);
+ }
+ }
+ else if (encoding.equals("MARC-8") || encoding.equals("MARC8"))
+ {
+ if (converterAnsel == null) converterAnsel = new AnselToUnicode();
+ dataElement = converterAnsel.convert(bytes);
+ }
+ else if (encoding.equals("ISO-8859-1") || encoding.equals("ISO8859_1"))
+ {
try {
- dataElement = new String(bytes, encoding);
- } catch (UnsupportedEncodingException e) {
+ dataElement = new String(bytes, "ISO-8859-1");
+ }
+ catch (UnsupportedEncodingException e) {
throw new MarcException("unsupported encoding", e);
}
- else
- dataElement = new String(bytes);
+ }
return dataElement;
}
-
+
}
\ No newline at end of file
diff --git a/src/org/marc4j/MarcXmlParserThread.java b/src/org/marc4j/MarcXmlParserThread.java
index fc7c4ced..5c98b5f4 100644
--- a/src/org/marc4j/MarcXmlParserThread.java
+++ b/src/org/marc4j/MarcXmlParserThread.java
@@ -1,4 +1,4 @@
-// $Id: MarcXmlParserThread.java,v 1.2 2006/05/20 09:25:46 bpeters Exp $
+// $Id: MarcXmlParserThread.java,v 1.3 2008/09/26 21:17:42 haschart Exp $
/**
* Copyright (C) 2004 Bas Peters
*
@@ -30,7 +30,7 @@
* MARCXML data.
*
* @author Bas Peters
- * @version $Revision: 1.2 $
+ * @version $Revision: 1.3 $
*/
public class MarcXmlParserThread extends Thread {
@@ -115,7 +115,12 @@ public void run() {
parser.parse(input);
else
parser.parse(input, th);
- } finally {
+ }
+ catch (MarcException me)
+ {
+ queue.passException(me);
+ }
+ finally {
queue.end();
}
}
diff --git a/src/org/marc4j/RecordStack.java b/src/org/marc4j/RecordStack.java
index a0a72699..97ff7773 100644
--- a/src/org/marc4j/RecordStack.java
+++ b/src/org/marc4j/RecordStack.java
@@ -1,4 +1,4 @@
-// $Id: RecordStack.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $
+// $Id: RecordStack.java,v 1.2 2008/09/26 21:17:42 haschart Exp $
/**
* Copyright (C) 2004 Bas Peters
*
@@ -31,12 +31,12 @@
* Record objects created by MarcXmlParser.
*
* @author Bas Peters
- * @version $Revision: 1.1 $
+ * @version $Revision: 1.2 $
*/
public class RecordStack {
private List list;
-
+ private RuntimeException re = null;
private boolean eof = false;
/**
@@ -77,6 +77,7 @@ public synchronized Record pop() {
} catch (Exception e) {
}
}
+ if (re != null) throw(re);
Record record = null;
if (list.size() > 0)
record = (Record) list.remove(0);
@@ -98,12 +99,23 @@ public synchronized boolean hasNext() {
} catch (Exception e) {
}
}
-
+ if (re != null) throw(re);
if (!isEmpty() || !eof)
return true;
return false;
}
+ /**
+ * Passes the exception to the thread where the MarcXmlReader is running, so that the next() call
+ * that is blocked waiting for this thread will receive the exception.
+ *
+ */
+ public synchronized void passException(RuntimeException e) {
+ re = e;
+ eof = true;
+ notifyAll();
+ }
+
/**
* Called when the end of the document is reached.
*
diff --git a/src/org/marc4j/converter/CharConverter.java b/src/org/marc4j/converter/CharConverter.java
index e04b94cb..74d4e2f7 100644
--- a/src/org/marc4j/converter/CharConverter.java
+++ b/src/org/marc4j/converter/CharConverter.java
@@ -1,4 +1,4 @@
-//$Id: CharConverter.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $
+//$Id: CharConverter.java,v 1.2 2008/09/26 21:18:16 haschart Exp $
/**
* Copyright (C) 2005 Bas Peters
*
@@ -25,9 +25,9 @@
* Implement this class to create a character converter.
*
* @author Bas Peters
- * @version $Revision: 1.1 $
+ * @version $Revision: 1.2 $
*/
-public interface CharConverter {
+public abstract class CharConverter {
/**
* Converts the dataElement and returns the result as a String
@@ -36,6 +36,25 @@ public interface CharConverter {
* @param dataElement the data to convert
* @return String the conversion result
*/
- public String convert(String dataElement);
+ public abstract String convert(char[] dataElement);
+
+ public String convert(byte[] data)
+ {
+ char cData[] = new char[data.length];
+ for (int i = 0; i < data.length; i++)
+ {
+ byte b = data[i];
+ cData[i] = (char)(b >= 0 ? b : 256 + b);
+ }
+ return convert(cData);
+ }
+
+ public String convert(String dataElement)
+ {
+ char[] data = null;
+ data = dataElement.toCharArray();
+ return (convert(data));
+ }
+
}
\ No newline at end of file
diff --git a/src/org/marc4j/converter/impl/AnselToUnicode.java b/src/org/marc4j/converter/impl/AnselToUnicode.java
index 6a0b3754..c54b29b3 100644
--- a/src/org/marc4j/converter/impl/AnselToUnicode.java
+++ b/src/org/marc4j/converter/impl/AnselToUnicode.java
@@ -1,4 +1,4 @@
-// $Id: AnselToUnicode.java,v 1.3 2005/12/14 17:11:30 bpeters Exp $
+// $Id: AnselToUnicode.java,v 1.4 2008/09/26 21:17:42 haschart Exp $
/**
* Copyright (C) 2002 Bas Peters (mail@bpeters.com)
*
@@ -21,8 +21,11 @@
package org.marc4j.converter.impl;
import java.io.InputStream;
+import java.lang.reflect.Constructor;
import java.util.Vector;
+import org.marc4j.ErrorHandler;
+import org.marc4j.MarcException;
import org.marc4j.converter.CharConverter;
/**
@@ -37,9 +40,9 @@
*
* @author Bas Peters
* @author Corey Keith
- * @version $Revision: 1.3 $
+ * @version $Revision: 1.4 $
*/
-public class AnselToUnicode implements CharConverter {
+public class AnselToUnicode extends CharConverter {
class Queue extends Vector {
@@ -101,20 +104,80 @@ public String toString() {
}
}
- protected CodeTable ct;
+ protected CodeTableInterface ct;
protected boolean loadedMultibyte = false;
+ protected ErrorHandler errorList = null;
/**
* Creates a new instance and loads the MARC4J supplied
* conversion tables based on the official LC tables.
*
*/
- public AnselToUnicode() {
- this(AnselToUnicode.class
- .getResourceAsStream("resources/codetablesnocjk.xml"));
+ public AnselToUnicode()
+ {
+ ct = loadGeneratedTable(false);
}
+
+ /**
+ * Creates a new instance and loads the MARC4J supplied
+ * conversion tables based on the official LC tables.
+ *
+ */
+ public AnselToUnicode(boolean loadMultibyte)
+ {
+ ct = loadGeneratedTable(loadMultibyte);
+ }
+ /**
+ * Creates a new instance and loads the MARC4J supplied
+ * conversion tables based on the official LC tables.
+ *
+ */
+ public AnselToUnicode(ErrorHandler errorList)
+ {
+ ct = loadGeneratedTable(false);
+ this.errorList = errorList;
+ }
+
+ /**
+ * Creates a new instance and loads the MARC4J supplied
+ * conversion tables based on the official LC tables.
+ *
+ */
+ public AnselToUnicode(ErrorHandler errorList, boolean loadMultibyte)
+ {
+ ct = loadGeneratedTable(loadMultibyte);
+ this.errorList = errorList;
+ }
+
+ private CodeTableInterface loadGeneratedTable(boolean loadMultibyte)
+ {
+ try
+ {
+ Class generated = Class.forName("org.marc4j.converter.impl.CodeTableGenerated");
+ Constructor cons = generated.getConstructor();
+ Object ct = cons.newInstance();
+ loadedMultibyte = true;
+ return((CodeTableInterface)ct);
+ }
+ catch (Exception e)
+ {
+ CodeTableInterface ct;
+ if (loadMultibyte)
+ {
+ ct = new CodeTable(AnselToUnicode.class.getResourceAsStream("resources/codetables.xml"));
+ }
+ else
+ {
+ ct = new CodeTable(AnselToUnicode.class.getResourceAsStream("resources/codetablesnocjk.xml"));
+ }
+ loadedMultibyte = loadMultibyte;
+ return(ct);
+ }
+
+ }
+
/**
* Constructs an instance with the specified pathname.
*
@@ -144,7 +207,7 @@ public AnselToUnicode(InputStream in) {
}
/**
- * Loads the entire maping (including multibyte characters) from the Library
+ * Loads the entire mapping (including multibyte characters) from the Library
* of Congress.
*/
private void loadMultibyte() {
@@ -153,58 +216,146 @@ private void loadMultibyte() {
}
private void checkMode(char[] data, CodeTracker cdt) {
- while (cdt.offset < data.length && isEscape(data[cdt.offset])) {
- switch (data[cdt.offset + 1]) {
- case 0x28:
- case 0x2c:
- cdt.g0 = data[cdt.offset + 2];
- cdt.offset += 3;
- cdt.multibyte = false;
+ int extra = 0;
+ int extra2 = 0;
+ int extra3 = 0;
+ while (cdt.offset + extra + extra2< data.length && isEscape(data[cdt.offset])) {
+ switch (data[cdt.offset + 1 + extra]) {
+ case 0x28: // '('
+ case 0x2c: // ','
+ set_cdt(cdt, 0, data, 2 + extra, false);
break;
- case 0x29:
- case 0x2d:
- cdt.g1 = data[cdt.offset + 2];
- cdt.offset += 3;
- cdt.multibyte = false;
+ case 0x29: // ')'
+ case 0x2d: // '-'
+ set_cdt(cdt, 1, data, 2 + extra, false);
break;
- case 0x24:
- cdt.multibyte = true;
+ case 0x24: // '$'
if (!loadedMultibyte) {
loadMultibyte();
loadedMultibyte = true;
}
- switch (data[cdt.offset + 1]) {
- case 0x29:
- case 0x2d:
- cdt.g1 = data[cdt.offset + 3];
- cdt.offset += 4;
+ switch (data[cdt.offset + 2 + extra + extra2]) {
+ case 0x29: // ')'
+ case 0x2d: // '-'
+ set_cdt(cdt, 1, data, 3 + extra + extra2, true);
+ break;
+ case 0x2c: // ','
+ set_cdt(cdt, 0, data, 3 + extra + extra2, true);
break;
- case 0x2c:
- cdt.g0 = data[cdt.offset + 3];
- cdt.offset += 4;
+ case 0x31: // '1'
+ cdt.g0 = data[cdt.offset + 2 + extra + extra2];
+ cdt.offset += 3 + extra + extra2;
+ cdt.multibyte = true;
break;
- default:
- cdt.g0 = data[cdt.offset + 2];
- cdt.offset += 3;
+ case 0x20: // ' '
+ // space found in escape code: look ahead and try to proceed
+ extra2++;
+ break;
+ default:
+ // unknown code character found: discard escape sequence and return
+ cdt.offset += 1;
+ if (errorList != null)
+ {
+ errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character.");
+ }
+ else
+ {
+ throw new MarcException("Unknown character set code found following escape character.");
+ }
break;
}
break;
- case 0x67:
- case 0x62:
- case 0x70:
- cdt.g0 = data[cdt.offset + 1];
- cdt.offset += 2;
+ case 0x67: // 'g'
+ case 0x62: // 'b'
+ case 0x70: // 'p'
+ cdt.g0 = data[cdt.offset + 1 + extra];
+ cdt.offset += 2 + extra;
cdt.multibyte = false;
break;
- case 0x73:
+ case 0x73: // 's'
cdt.g0 = 0x42;
- cdt.offset += 2;
+ cdt.offset += 2 + extra;
cdt.multibyte = false;
break;
+ case 0x20: // ' '
+ // space found in escape code: look ahead and try to proceed
+ if (errorList == null)
+ {
+ throw new MarcException("Extraneous space character found within MARC8 character set escape sequence");
+ }
+ extra++;
+ break;
+ default:
+ // unknown code character found: discard escape sequence and return
+ cdt.offset += 1;
+ if (errorList != null)
+ {
+ errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character.");
+ }
+ else
+ {
+ throw new MarcException("Unknown character set code found following escape character.");
+ }
+ break;
}
}
+ if (errorList != null && ( extra != 0 || extra2 != 0))
+ {
+ errorList.addError(ErrorHandler.MINOR_ERROR, "" + (extra+extra2) + " extraneous space characters found within MARC8 character set escape sequence");
+ }
}
+ private void set_cdt(CodeTracker cdt, int g0_or_g1, char[] data, int addnlOffset, boolean multibyte)
+ {
+ if (data[cdt.offset + addnlOffset] == '!' && data[cdt.offset + addnlOffset + 1] == 'E')
+ {
+ addnlOffset++;
+ }
+ else if (data[cdt.offset + addnlOffset] == ' ')
+ {
+ if (errorList != null)
+ {
+ errorList.addError(ErrorHandler.MINOR_ERROR, "Extraneous space character found within MARC8 character set escape sequence. Skipping over space.");
+ }
+ else
+ {
+ throw new MarcException("Extraneous space character found within MARC8 character set escape sequence");
+ }
+ addnlOffset++;
+ }
+ else if ("(,)-$!".indexOf(data[cdt.offset + addnlOffset]) != -1)
+ {
+ if (errorList != null)
+ {
+ errorList.addError(ErrorHandler.MINOR_ERROR, "Extraneaous intermediate character found following escape character. Discarding intermediate character.");
+ }
+ else
+ {
+ throw new MarcException("Extraneaous intermediate character found following escape character.");
+ }
+ addnlOffset++;
+ }
+ if ("34BE1NQS2".indexOf(data[cdt.offset + addnlOffset]) == -1)
+ {
+ cdt.offset += 1;
+ cdt.multibyte = false;
+ if (errorList != null)
+ {
+ errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character.");
+ }
+ else
+ {
+ throw new MarcException("Unknown character set code found following escape character.");
+ }
+ }
+ else // All is well, proceed normally
+ {
+ if (g0_or_g1 == 0) cdt.g0 = data[cdt.offset + addnlOffset];
+ else cdt.g1 = data[cdt.offset + addnlOffset];
+ cdt.offset += 1 + addnlOffset;
+ cdt.multibyte = multibyte;
+ }
+ }
/**
*
* Converts MARC-8 data to UCS/Unicode.
@@ -214,9 +365,8 @@ private void checkMode(char[] data, CodeTracker cdt) {
* the MARC-8 data
* @return String - the UCS/Unicode data
*/
- public String convert(String dataElement) {
- char[] data = null;
- data = dataElement.toCharArray();
+ public String convert(char data[])
+ {
StringBuffer sb = new StringBuffer();
int len = data.length;
@@ -232,14 +382,17 @@ public String convert(String dataElement) {
Queue diacritics = new Queue();
- while (cdt.offset < data.length) {
+ while (cdt.offset < data.length)
+ {
if (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1)
- && hasNext(cdt.offset, len)) {
+ && hasNext(cdt.offset, len))
+ {
while (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1)
- && hasNext(cdt.offset, len)) {
- diacritics.put(new Character(getChar(data[cdt.offset],
- cdt.g0, cdt.g1)));
+ && hasNext(cdt.offset, len))
+ {
+ char c = getChar(data[cdt.offset], cdt.g0, cdt.g1);
+ if (c != 0) diacritics.put(new Character(c));
cdt.offset++;
checkMode(data, cdt);
}
@@ -247,23 +400,122 @@ && hasNext(cdt.offset, len)) {
char c2 = getChar(data[cdt.offset], cdt.g0, cdt.g1);
cdt.offset++;
checkMode(data, cdt);
- sb.append(c2);
+ if (c2 != 0) sb.append(c2);
- while (!diacritics.isEmpty()) {
+ while (!diacritics.isEmpty())
+ {
char c1 = ((Character) diacritics.get()).charValue();
sb.append(c1);
}
- } else if (cdt.multibyte) {
- sb.append(ct.getChar(makeMultibyte(new String(data).substring(
- cdt.offset, cdt.offset + 4).toCharArray()), cdt.g0));
- cdt.offset += 3;
- } else {
- sb.append(getChar(data[cdt.offset], cdt.g0, cdt.g1));
+ }
+ else if (cdt.multibyte)
+ {
+ if (data[cdt.offset]== 0x20)
+ {
+ // A 0x20 byte in the middle of a run of multibyte characters is skipped here;
+ // this branch only advances the offset and appends nothing itself.
+ // Hmmm. If the following line is enabled, two spaces seem to be output for each
+ // space among the multibyte chars; without it, one space seems to be output.
+ // sb.append(getChar(data[cdt.offset], cdt.g0, cdt.g1));
+ cdt.offset += 1;
+ }
+ else if (cdt.offset + 3 <= data.length && (errorList == null || data[cdt.offset+1]!= 0x20 && data[cdt.offset+2]!= 0x20))
+ {
+ char c = getMBChar(makeMultibyte(data[cdt.offset], data[cdt.offset+1], data[cdt.offset+2]));
+ if (errorList == null || c != 0)
+ {
+ sb.append(c);
+ cdt.offset += 3;
+ }
+ else if (cdt.offset + 6 <= data.length && data[cdt.offset+4]!= 0x20 && data[cdt.offset+5]!= 0x20 &&
+ getMBChar(makeMultibyte(data[cdt.offset+3], data[cdt.offset+4], data[cdt.offset+5])) != 0)
+ {
+ if (errorList != null)
+ {
+ errorList.addError(ErrorHandler.MAJOR_ERROR, "Erroneous MARC8 multibyte character, Discarding bad character and continuing reading Multibyte characters");
+ sb.append("[?]");
+ cdt.offset += 3;
+ }
+ }
+ else if (cdt.offset + 4 <= data.length && data[cdt.offset] > 0x7f &&
+ getMBChar(makeMultibyte(data[cdt.offset+1], data[cdt.offset+2], data[cdt.offset+3])) != 0)
+ {
+ if (errorList != null)
+ {
+ errorList.addError(ErrorHandler.MAJOR_ERROR, "Erroneous character in MARC8 multibyte character, Copying bad character and continuing reading Multibyte characters");
+ sb.append(getChar(data[cdt.offset], 0x42, 0x45));
+ cdt.offset += 1;
+ }
+ }
+ else
+ {
+ if (errorList != null)
+ {
+ errorList.addError(ErrorHandler.MAJOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set");
+ }
+ cdt.multibyte = false;
+ cdt.g0 = 0x42;
+ cdt.g1 = 0x45;
+ }
+ }
+ else if (errorList != null && cdt.offset + 4 <= data.length && ( data[cdt.offset+1] == 0x20 || data[cdt.offset+2]== 0x20))
+ {
+ int multiByte = makeMultibyte( data[cdt.offset], ((data[cdt.offset+1] != 0x20)? data[cdt.offset+1] : data[cdt.offset+2]), data[cdt.offset+3]);
+ char c = getMBChar(multiByte);
+ if (c != 0)
+ {
+ if (errorList != null)
+ {
+ errorList.addError(ErrorHandler.ERROR_TYPO, "Extraneous space found within MARC8 multibyte character");
+ }
+ sb.append(c);
+ sb.append(' ');
+ cdt.offset += 4;
+ }
+ else
+ {
+ if (errorList != null)
+ {
+ errorList.addError(ErrorHandler.MAJOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set");
+ }
+ cdt.multibyte = false;
+ cdt.g0 = 0x42;
+ cdt.g1 = 0x45;
+ }
+ }
+ else if (cdt.offset + 3 > data.length)
+ {
+ if (errorList != null)
+ {
+ errorList.addError(ErrorHandler.MAJOR_ERROR, "Partial MARC8 multibyte character, inserting change to default character set");
+ cdt.multibyte = false;
+ cdt.g0 = 0x42;
+ cdt.g1 = 0x45;
+ }
+ // if a field ends with an incomplete encoding of a multibyte character
+ // simply discard that final partial character.
+ else
+ {
+ cdt.offset += 3;
+ }
+ }
+ }
+ else
+ {
+ char c = getChar(data[cdt.offset], cdt.g0, cdt.g1);
+ if (c != 0) sb.append(c);
+ else
+ {
+ String val = "0000"+Integer.toHexString((int)(data[cdt.offset]));
+ sb.append("<U+"+ (val.substring(val.length()-4, val.length()))+ ">" );
+ }
cdt.offset += 1;
}
if (hasNext(cdt.offset, len))
+ {
checkMode(data, cdt);
+ }
}
return sb.toString();
}
@@ -275,6 +527,15 @@ private int makeMultibyte(char[] data) {
chars[2] = data[2];
return chars[0] | chars[1] | chars[2];
}
+
+ public int makeMultibyte(char c1, char c2, char c3) // packs three chars into one int: (c1 << 16) | (c2 << 8) | c3
+ {
+ int[] chars = new int[3];
+ chars[0] = c1 << 16; // first char, shifted left 16 bits
+ chars[1] = c2 << 8; // second char, shifted left 8 bits
+ chars[2] = c3; // third char, unshifted
+ return chars[0] | chars[1] | chars[2]; // the parts only occupy separate bit ranges when c2 and c3 are <= 0xFF
+ }
private char getChar(int ch, int g0, int g1) {
if (ch <= 0x7E)
@@ -283,7 +544,7 @@ private char getChar(int ch, int g0, int g1) {
return ct.getChar(ch, g1);
}
- private char getMBChar(int ch) {
+ public char getMBChar(int ch) { // looks ch up via ct.getChar with mode 0x31; this change widens visibility from private to public
return ct.getChar(ch, 0x31);
}
diff --git a/src/org/marc4j/converter/impl/CodeTable.java b/src/org/marc4j/converter/impl/CodeTable.java
index 586efd30..ed125ec4 100644
--- a/src/org/marc4j/converter/impl/CodeTable.java
+++ b/src/org/marc4j/converter/impl/CodeTable.java
@@ -1,4 +1,4 @@
-// $Id: CodeTable.java,v 1.2 2005/12/14 17:11:30 bpeters Exp $
+// $Id: CodeTable.java,v 1.3 2008/09/26 21:17:42 haschart Exp $
/**
* Copyright (C) 2002 Bas Peters
*
@@ -23,8 +23,11 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
+import java.io.PrintStream;
import java.net.URI;
-import java.util.Hashtable;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
import java.util.Vector;
import javax.xml.parsers.SAXParser;
@@ -41,21 +44,21 @@
*
*
* @author Corey Keith
- * @version $Revision: 1.2 $
+ * @version $Revision: 1.3 $
*
*/
-public class CodeTable {
- protected static Hashtable charsets = null;
+public class CodeTable implements CodeTableInterface {
+ protected static HashMap charsets = null;
- protected static Hashtable combining = null;
+ protected static HashMap combining = null;
public boolean isCombining(int i, int g0, int g1) {
if (i <= 0x7E) {
Vector v = (Vector) combining.get(new Integer(g0));
- return v.contains(new Integer(i));
+ return (v != null && v.contains(new Integer(i))); // i <= 0x7E: consult the list stored under g0; no list for g0 now means "not combining" instead of a NullPointerException
} else {
Vector v = (Vector) combining.get(new Integer(g1));
- return v.contains(new Integer(i));
+ return (v != null && v.contains(new Integer(i))); // i > 0x7E: same lookup under g1, with the same null guard
}
}
@@ -63,7 +66,7 @@ public char getChar(int c, int mode) {
if (c == 0x20)
return (char) c;
else {
- Hashtable charset = (Hashtable) charsets.get(new Integer(mode));
+ HashMap charset = (HashMap) charsets.get(new Integer(mode));
if (charset == null) {
// System.err.println("Hashtable not found: "
@@ -72,18 +75,13 @@ public char getChar(int c, int mode) {
} else {
Character ch = (Character) charset.get(new Integer(c));
if (ch == null) {
-
- int newc;
- if (c < 0x80)
- newc = c + 0x80;
- else
- newc = c - 0x80;
+ int newc = (c < 0x80) ? c + 0x80 : c - 0x80;
ch = (Character) charset.get(new Integer(newc));
if (ch == null) {
// System.err.println("Character not found: "
// + Integer.toHexString(c) + " in Code Table: "
// + Integer.toHexString(mode));
- return (char) c;
+ return (char) 0;
} else
return ch.charValue();
} else
@@ -92,6 +90,144 @@ public char getChar(int c, int mode) {
}
}
+ public void dumpTableAsSwitchStatement(PrintStream output) // prints Java source for a class CodeTableGenerated, built from the combining and charsets maps, to output
+ {
+ output.println("package org.marc4j.converter.impl;");
+ output.println("");
+ output.println("// Warning: This file is generated by running the main routine in the file CodeTable.java ");
+ output.println("// Warning: Do not edit this file, or all edits will be lost at the next build. ");
+ output.println("public class CodeTableGenerated implements CodeTableInterface {");
+ output.println("\tpublic boolean isCombining(int i, int g0, int g1) {"); // --- part 1: generated isCombining ---
+ output.println("\t\tswitch (i <= 0x7E ? g0 : g1) {"); // generated code picks g0 or g1 by the same 0x7E threshold used in isCombining above
+ Object combiningKeys[] = combining.keySet().toArray();
+ Arrays.sort(combiningKeys); // sorted so the emitted case labels come out in a stable order
+ for (int combiningSel = 0; combiningSel < combiningKeys.length; combiningSel++)
+ {
+ Integer nextKey = (Integer)combiningKeys[combiningSel];
+ output.println("\t\t\tcase 0x"+Integer.toHexString(nextKey)+":"); // one outer case per key of the combining map
+ Vector v = (Vector) combining.get(nextKey);
+ Iterator vIter = v.iterator();
+ if (vIter.hasNext())
+ {
+ output.println("\t\t\t\tswitch(i) {");
+ while (vIter.hasNext())
+ {
+ Integer vVal = (Integer)vIter.next();
+ output.println("\t\t\t\t\tcase 0x"+Integer.toHexString(vVal)+":"); // label only; all labels fall through to the single return(true) below
+ }
+ output.println("\t\t\t\t\t\treturn(true);");
+ output.println("\t\t\t\t\tdefault:");
+ output.println("\t\t\t\t\t\treturn(false);");
+ output.println("\t\t\t\t}");
+ }
+ else
+ {
+ output.println("\t\t\t\treturn(false);"); // empty list for this key: the generated case just returns false
+ }
+ }
+ output.println("\t\t\tdefault:");
+ output.println("\t\t\t\treturn(false);");
+ output.println("\t\t\t}");
+ output.println("\t}");
+ output.println("");
+ output.println("\tpublic char getChar(int c, int mode) {"); // --- part 2: generated getChar wrapper ---
+ output.println("\t\tint code = getCharCode(c, mode);");
+ output.println("\t\tif (code == -1) return((char)0);"); // -1 is what the generated getCharCode returns for a mode with no case
+ output.println("\t\tif (code != 0) return((char)code);");
+ output.println("\t\tcode = getCharCode(c < 0x80 ? c + 0x80 : c - 0x80 , mode);"); // a 0 result triggers one retry with c shifted by 0x80 in the other direction
+ output.println("\t\treturn((char)code);");
+ output.println("\t}");
+ output.println("");
+ output.println("\tprivate int getCharCode(int c, int mode) {"); // --- part 3: generated per-charset lookup ---
+ output.println("\t\tif (c == 0x20) return c;"); // generated code returns 0x20 unchanged before looking at mode
+ output.println("\t\tswitch (mode) {");
+ Object charsetsKeys[] = charsets.keySet().toArray();
+ Arrays.sort(charsetsKeys);
+ for (int charsetSel = 0; charsetSel < charsetsKeys.length; charsetSel++)
+ {
+ Integer nextKey = (Integer)charsetsKeys[charsetSel];
+ output.println("\t\t\tcase 0x"+Integer.toHexString(nextKey)+":");
+ if (nextKey.intValue() == 0x31) // charset 0x31 is not inlined here; its case delegates to getMultiByteChar, emitted further down
+ {
+ output.println("\t\t\t\treturn(getMultiByteChar(c));");
+ }
+ else
+ {
+ HashMap map = (HashMap) charsets.get(nextKey);
+ Object keyArray[] = map.keySet().toArray();
+ Arrays.sort(keyArray);
+ output.println("\t\t\t\tswitch(c) {");
+ for (int sel = 0; sel < keyArray.length; sel++)
+ {
+ Integer mKey = (Integer)keyArray[sel];
+ Character c = (Character)map.get(mKey);
+ if (c != null)
+ output.println("\t\t\t\t\tcase 0x"+Integer.toHexString(mKey)+": return(0x"+Integer.toHexString((int)c.charValue())+"); ");
+ else
+ output.println("\t\t\t\t\tcase 0x"+Integer.toHexString(mKey)+": return(0); "); // key present but mapped to null: emitted as 0, same as the default case
+ }
+ output.println("\t\t\t\t\tdefault: return(0);");
+ output.println("\t\t\t\t}");
+ }
+ }
+ output.println("\t\t\tdefault: return(-1); // unknown charset specified ");
+ output.println("\t\t}");
+ output.println("\t}");
+ output.println("");
+ StringBuffer getMultiByteFunc = new StringBuffer(); // --- part 4: the getMultiByteChar dispatcher is accumulated here while the per-range methods are printed directly ---
+ getMultiByteFunc.append("\tpublic int getMultiByteChar(int c) {\n");
+
+ HashMap map = (HashMap) charsets.get(new Integer(0x31));
+ Object keyArray[] = map.keySet().toArray();
+ Arrays.sort(keyArray);
+
+ dumpPartialMultiByteTable(output, getMultiByteFunc, keyArray, map, 0x210000, 0x214fff); // NOTE(review): the 0x31 table is split by key range into several generated methods, presumably to keep each one under the JVM method-size limit; confirm
+ dumpPartialMultiByteTable(output, getMultiByteFunc, keyArray, map, 0x215000, 0x21ffff);
+// dumpPartialMultiByteTable(output, getMultiByteFunc, keyArray, map, 0x210000, 0x21ffff);
+ dumpPartialMultiByteTable(output, getMultiByteFunc, keyArray, map, 0x220000, 0x22ffff);
+ dumpPartialMultiByteTable(output, getMultiByteFunc, keyArray, map, 0x230000, 0x27ffff);
+ dumpPartialMultiByteTable(output, getMultiByteFunc, keyArray, map, 0x280000, 0x7f7fff);
+
+ getMultiByteFunc.append("\t\treturn(0);\n"); // generated dispatcher returns 0 when c is outside every range listed above
+ getMultiByteFunc.append("\t}");
+ output.println(getMultiByteFunc.toString()); // dispatcher is printed after all the per-range methods it calls
+
+ output.println("}"); // closes the generated class
+
+ }
+
+ public void dumpPartialMultiByteTable(PrintStream output, StringBuffer buffer, Object keyArray[], HashMap map, int startByte, int endByte) // prints one generated lookup method for keys in [startByte, endByte] to output and appends the matching dispatch line to buffer
+ {
+ String startByteStr = "0x"+Integer.toHexString(startByte);
+ String endByteStr = "0x"+Integer.toHexString(endByte);
+ buffer.append("\t\tif (c >= "+startByteStr+" && c <= "+endByteStr+") return (getMultiByteChar_"+startByteStr+"_"+endByteStr+"(c));\n"); // dispatch line: the range bounds are inclusive and also form the generated method's name
+
+ output.println("\tpublic char getMultiByteChar_"+startByteStr+"_"+endByteStr+"(int c) {");
+ output.println("\t\tswitch(c) {");
+ for (int sel = 0; sel < keyArray.length; sel++) // walks the whole key array on every call; keys outside the range are skipped below
+ {
+ Integer mKey = (Integer)keyArray[sel];
+ Character c = (Character)map.get(mKey);
+ if (mKey >= startByte && mKey <= endByte) // Integer is auto-unboxed for the comparison
+ {
+ if (c != null)
+ output.println("\t\t\tcase 0x"+Integer.toHexString(mKey)+": return((char)0x"+Integer.toHexString((int)c.charValue())+"); ");
+ else
+ output.println("\t\t\tcase 0x"+Integer.toHexString(mKey)+": return((char)0); "); // key present but mapped to null: generated case returns (char)0
+ }
+ }
+ output.println("\t\t\tdefault: return((char)0);"); // generated method returns (char)0 for any c without a case
+ output.println("\t\t}");
+ output.println("\t}");
+ output.println("");
+ }
+
+ public static void main(String args[]) // generator entry point; args is not used
+ {
+ CodeTable ct = new CodeTable(CodeTable.class.getResourceAsStream("resources/codetables.xml")); // builds the table from the codetables.xml class resource
+ ct.dumpTableAsSwitchStatement(System.out); // prints the generated CodeTableGenerated source to standard output
+ }
+
public CodeTable(InputStream byteStream) {
try {
diff --git a/src/org/marc4j/converter/impl/CodeTableHandler.java b/src/org/marc4j/converter/impl/CodeTableHandler.java
index 3ea843dc..9a9a4321 100644
--- a/src/org/marc4j/converter/impl/CodeTableHandler.java
+++ b/src/org/marc4j/converter/impl/CodeTableHandler.java
@@ -1,4 +1,4 @@
-// $Id: CodeTableHandler.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $
+// $Id: CodeTableHandler.java,v 1.2 2008/09/26 21:17:42 haschart Exp $
/**
* Copyright (C) 2002 Bas Peters
*
@@ -22,7 +22,7 @@
import java.io.File;
import java.io.FileInputStream;
-import java.util.Hashtable;
+import java.util.HashMap;
import java.util.Vector;
import javax.xml.parsers.SAXParser;
@@ -41,17 +41,17 @@
* builds a data structure to facilitate AnselToUnicode character conversion.
*
* @author Corey Keith
- * @version $Revision: 1.1 $
+ * @version $Revision: 1.2 $
*
* @see DefaultHandler
*/
public class CodeTableHandler extends DefaultHandler {
- private Hashtable sets;
+ private HashMap sets;
- private Hashtable charset;
+ private HashMap charset;
- private Hashtable combiningchars;
+ private HashMap combiningchars;
/** Data element identifier */
private Integer isocode;
@@ -75,11 +75,11 @@ public class CodeTableHandler extends DefaultHandler {
/** Locator object */
private Locator locator;
- public Hashtable getCharSets() {
+ public HashMap getCharSets() {
return sets;
}
- public Hashtable getCombiningChars() {
+ public HashMap getCombiningChars() {
return combiningchars;
}
@@ -98,14 +98,14 @@ public void setDocumentLocator(Locator locator) {
public void startElement(String uri, String name, String qName,
Attributes atts) throws SAXParseException {
if (name.equals("characterSet")) {
- charset = new Hashtable();
+ charset = new HashMap();
isocode = Integer.valueOf(atts.getValue("ISOcode"), 16);
combining = new Vector();
} else if (name.equals("marc"))
data = new StringBuffer();
else if (name.equals("codeTables")) {
- sets = new Hashtable();
- combiningchars = new Hashtable();
+ sets = new HashMap();
+ combiningchars = new HashMap();
} else if (name.equals("ucs"))
data = new StringBuffer();
else if (name.equals("alt"))
@@ -135,7 +135,7 @@ public void endElement(String uri, String name, String qName)
if (data.length() > 0)
ucs = new Character((char) Integer.parseInt(data.toString(), 16));
else
- useAlt = true;
+ ucs = null;
} else if (name.equals("alt")) {
if (useAlt && data.length() > 0) {
ucs = new Character((char) Integer.parseInt(data.toString(), 16));
@@ -155,7 +155,7 @@ public void endElement(String uri, String name, String qName)
}
public static void main(String[] args) {
- Hashtable charsets = null;
+ HashMap charsets = null;
try {
diff --git a/src/org/marc4j/converter/impl/CodeTableInterface.java b/src/org/marc4j/converter/impl/CodeTableInterface.java
new file mode 100644
index 00000000..b8b31bed
--- /dev/null
+++ b/src/org/marc4j/converter/impl/CodeTableInterface.java
@@ -0,0 +1,7 @@
+package org.marc4j.converter.impl;
+
+public interface CodeTableInterface // lookup contract implemented by CodeTable in this change (and by the CodeTableGenerated source that CodeTable can emit)
+{
+ public boolean isCombining(int i, int g0, int g1); // whether i is a combining character; CodeTable checks g0 when i <= 0x7E and g1 otherwise
+ public char getChar(int c, int mode); // character for code c in charset mode; CodeTable returns (char) 0 when the charset has no entry for c
+};
diff --git a/src/org/marc4j/converter/impl/Iso5426ToUnicode.java b/src/org/marc4j/converter/impl/Iso5426ToUnicode.java
index 371735a3..91974521 100644
--- a/src/org/marc4j/converter/impl/Iso5426ToUnicode.java
+++ b/src/org/marc4j/converter/impl/Iso5426ToUnicode.java
@@ -1,4 +1,4 @@
-// $Id: Iso5426ToUnicode.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $
+// $Id: Iso5426ToUnicode.java,v 1.2 2008/09/26 21:17:42 haschart Exp $
/**
* Copyright (C) 2002 Bas Peters (mail@bpeters.com)
* Copyright (C) 2002 Yves Pratter (ypratter@club-internet.fr)
@@ -30,9 +30,9 @@
*
* @author Bas Peters
* @author Yves Pratter
- * @version $Revision: 1.1 $
+ * @version $Revision: 1.2 $
*/
-public class Iso5426ToUnicode implements CharConverter {
+public class Iso5426ToUnicode extends CharConverter {
/**
*
@@ -43,8 +43,7 @@ public class Iso5426ToUnicode implements CharConverter {
* the UNIMARC data
* @return {@link String}- the UCS/Unicode data
*/
- public String convert(String dataElement) {
- char[] data = dataElement.toCharArray();
+ public String convert(char data[]) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < data.length; i++) {
diff --git a/src/org/marc4j/converter/impl/Iso6937ToUnicode.java b/src/org/marc4j/converter/impl/Iso6937ToUnicode.java
index 35e938b0..21c24ccd 100644
--- a/src/org/marc4j/converter/impl/Iso6937ToUnicode.java
+++ b/src/org/marc4j/converter/impl/Iso6937ToUnicode.java
@@ -1,4 +1,4 @@
-// $Id: Iso6937ToUnicode.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $
+// $Id: Iso6937ToUnicode.java,v 1.2 2008/09/26 21:17:42 haschart Exp $
/**
* Copyright (C) 2002 Bas Peters (mail@bpeters.com)
* Copyright (C) 2002 Yves Pratter (ypratter@club-internet.fr)
@@ -30,9 +30,9 @@
*
* @author Bas Peters
* @author Yves Pratter
- * @version $Revision: 1.1 $
+ * @version $Revision: 1.2 $
*/
-public class Iso6937ToUnicode implements CharConverter {
+public class Iso6937ToUnicode extends CharConverter {
/**
*
@@ -43,8 +43,7 @@ public class Iso6937ToUnicode implements CharConverter {
* the ISO 6937 data
* @return {@link String}- the UCS/Unicode data
*/
- public String convert(String dataElement) {
- char[] data = dataElement.toCharArray();
+ public String convert(char data[]) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < data.length; i++) {
diff --git a/src/org/marc4j/converter/impl/UnicodeToAnsel.java b/src/org/marc4j/converter/impl/UnicodeToAnsel.java
index 5831bde4..774ca8ad 100644
--- a/src/org/marc4j/converter/impl/UnicodeToAnsel.java
+++ b/src/org/marc4j/converter/impl/UnicodeToAnsel.java
@@ -1,4 +1,4 @@
-// $Id: UnicodeToAnsel.java,v 1.2 2005/11/28 16:50:22 bpeters Exp $
+// $Id: UnicodeToAnsel.java,v 1.3 2008/09/26 21:17:42 haschart Exp $
/**
* Copyright (C) 2002 Bas Peters (mail@bpeters.com)
*
@@ -36,9 +36,9 @@
*
* @author Bas Peters
* @author Corey Keith
- * @version $Revision: 1.2 $
+ * @version $Revision: 1.3 $
*/
-public class UnicodeToAnsel implements CharConverter {
+public class UnicodeToAnsel extends CharConverter {
protected ReverseCodeTable rct;
static final char ESC = 0x1b;
@@ -97,8 +97,7 @@ public UnicodeToAnsel(InputStream in) {
* the UCS/Unicode data
* @return String - the MARC-8 data
*/
- public String convert(String dataElement) {
- char[] data = dataElement.toCharArray();
+ public String convert(char data[]) {
StringBuffer sb = new StringBuffer();
CodeTableTracker ctt = new CodeTableTracker();
diff --git a/src/org/marc4j/converter/impl/UnicodeToIso5426.java b/src/org/marc4j/converter/impl/UnicodeToIso5426.java
index e2bfa958..ee139579 100644
--- a/src/org/marc4j/converter/impl/UnicodeToIso5426.java
+++ b/src/org/marc4j/converter/impl/UnicodeToIso5426.java
@@ -1,4 +1,4 @@
-// $Id: UnicodeToIso5426.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $
+// $Id: UnicodeToIso5426.java,v 1.2 2008/09/26 21:17:42 haschart Exp $
/**
* Copyright (C) 2002 Bas Peters (mail@bpeters.com)
*
@@ -29,9 +29,9 @@
*
* @author Bas Peters
* @author Yves Pratter
- * @version $Revision: 1.1 $
+ * @version $Revision: 1.2 $
*/
-public class UnicodeToIso5426 implements CharConverter {
+public class UnicodeToIso5426 extends CharConverter {
/**
*
@@ -46,8 +46,7 @@ public class UnicodeToIso5426 implements CharConverter {
* the UCS/Unicode data
* @return {@link String}- the UNIMARC (ISO 5426 charset) data
*/
- public String convert(String dataElement) {
- char[] data = dataElement.toCharArray();
+ public String convert(char data[]) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < data.length; i++) {
char c = data[i];
diff --git a/src/org/marc4j/converter/impl/UnicodeToIso6937.java b/src/org/marc4j/converter/impl/UnicodeToIso6937.java
index a6f66eff..d6c5d8f0 100644
--- a/src/org/marc4j/converter/impl/UnicodeToIso6937.java
+++ b/src/org/marc4j/converter/impl/UnicodeToIso6937.java
@@ -1,4 +1,4 @@
-// $Id: UnicodeToIso6937.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $
+// $Id: UnicodeToIso6937.java,v 1.2 2008/09/26 21:17:42 haschart Exp $
/**
* Copyright (C) 2002 Bas Peters (mail@bpeters.com)
*
@@ -29,9 +29,9 @@
*
* @author Bas Peters
* @author Yves Pratter
- * @version $Revision: 1.1 $
+ * @version $Revision: 1.2 $
*/
-public class UnicodeToIso6937 implements CharConverter {
+public class UnicodeToIso6937 extends CharConverter {
/**
*