From 1dba2aab97bbc4ada2e23b7205882c58afd7825a Mon Sep 17 00:00:00 2001 From: haschart Date: Fri, 26 Sep 2008 21:17:42 +0000 Subject: [PATCH] Issue number: Changes for 2.4 release Submitted by: Robert Haschart --- build.properties | 2 +- build.xml | 18 +- changes.txt | 27 +- src/org/marc4j/ErrorHandler.java | 205 +++ src/org/marc4j/MarcDirStreamReader.java | 145 ++ .../marc4j/MarcPermissiveStreamReader.java | 1523 +++++++++++++++++ src/org/marc4j/MarcStreamReader.java | 311 ++-- src/org/marc4j/MarcXmlParserThread.java | 11 +- src/org/marc4j/RecordStack.java | 20 +- src/org/marc4j/converter/CharConverter.java | 27 +- .../marc4j/converter/impl/AnselToUnicode.java | 375 +++- src/org/marc4j/converter/impl/CodeTable.java | 168 +- .../converter/impl/CodeTableHandler.java | 26 +- .../converter/impl/CodeTableInterface.java | 7 + .../converter/impl/Iso5426ToUnicode.java | 9 +- .../converter/impl/Iso6937ToUnicode.java | 9 +- .../marc4j/converter/impl/UnicodeToAnsel.java | 9 +- .../converter/impl/UnicodeToIso5426.java | 9 +- .../converter/impl/UnicodeToIso6937.java | 9 +- src/org/marc4j/marc/impl/RecordImpl.java | 8 +- src/org/marc4j/marc/impl/Verifier.java | 16 +- .../samples/HandleExceptionExample.java | 57 + .../samples/PermissiveReaderExample.java | 229 +++ .../marc4j/samples/resources/diacritic4.mrc | 1 + src/org/marc4j/samples/resources/error.mrc | 1 + 25 files changed, 2957 insertions(+), 265 deletions(-) create mode 100644 src/org/marc4j/ErrorHandler.java create mode 100644 src/org/marc4j/MarcDirStreamReader.java create mode 100644 src/org/marc4j/MarcPermissiveStreamReader.java create mode 100644 src/org/marc4j/converter/impl/CodeTableInterface.java create mode 100644 src/org/marc4j/samples/HandleExceptionExample.java create mode 100644 src/org/marc4j/samples/PermissiveReaderExample.java create mode 100644 src/org/marc4j/samples/resources/diacritic4.mrc create mode 100644 src/org/marc4j/samples/resources/error.mrc diff --git a/build.properties b/build.properties index 
5d6f38bf..2c4c6096 100644 --- a/build.properties +++ b/build.properties @@ -3,5 +3,5 @@ src.dir=src build.dir=build dist.dir=dist apidoc.dir=apidoc -version=2.3.2 +version=2.4 project.name=marc4j \ No newline at end of file diff --git a/build.xml b/build.xml index c619e3fc..22995475 100644 --- a/build.xml +++ b/build.xml @@ -27,10 +27,22 @@ + + + + - - - + + + + + + + + + + + diff --git a/changes.txt b/changes.txt index 1f567c7b..9a672dd3 100644 --- a/changes.txt +++ b/changes.txt @@ -1,9 +1,34 @@ +Changes to MARC4J 2.4 + +MARC4J 2.4 is a minor release providing some bug fixes and some new functionality. + +- Added MarcPermissiveStreamReader which is more capable of reading records that contain structural or + encoding errors, and is capable of translating the records to UTF-8 as they are read. +- Added ErrorHandler which is used for tracking and reporting structural or encoding errors + encountered by the MarcPermissiveStreamReader. +- Added MarcDirStreamReader which iterates over all of the MARC record files in a given directory. +- Modified MarcStreamReader so that if an exception is thrown for an error in one record you can + choose to catch the exception, discard the erroneous record and continue reading from the input file. +- Modified AnselToUnicode to fix some problems that would occur when trying to handle Chinese characters, + to fix an infinite loop problem that would occur sometimes when extraneous characters appear within a + MARC8 character set escape sequence, and made many changes to support the MarcPermissiveStreamReader + to report and try to recover from encoding errors in the records being read. +- Modified CodeTable (which is used by AnselToUnicode) so that rather than reading and parsing a large + XML file to create the hash tables for mapping MARC8 to Unicode at runtime, the parsing is done once + at compile time, and a class that handles the mapping directly via switch statements is automatically + generated. 
+- Made minor changes to the MarcXmlReader so that if an exception occurs in the MarcXmlParserThread that + it starts, the exception is passed to the MarcXmlReader rather than simply hanging the parser thread. +- Added PermissiveReaderExample which demonstrates how to use the MarcPermissiveReader to examine and/or + validate records for structural or encoding errors. + + Changes to MARC4J 2.3.1 MARC4J 2.3.1 is a minor release with some encoding fixes - Fixed encoding bug in MarcStreamReader: now sets ISO8859_1 as default as alternative for MARC-8 and - UNIMARC encoding alternative. For MARC 21 the ledare is checked: space is ISO 8859_1 and a is UTF-8. + UNIMARC encoding alternative. For MARC 21 the leader is checked: space is ISO 8859_1 and a is UTF-8. When an encoding is provided in the MarcStreamReader constructor, this encoding overrides the default encoding and the leader encoding value. - MarcXmlDriver: when converting from MARC-8 to UTF-8 character coding scheme in leader (pos. 9) is set to 'a'. diff --git a/src/org/marc4j/ErrorHandler.java b/src/org/marc4j/ErrorHandler.java new file mode 100644 index 00000000..42041075 --- /dev/null +++ b/src/org/marc4j/ErrorHandler.java @@ -0,0 +1,205 @@ +// $Id: ErrorHandler.java,v 1.6 2008/09/26 21:17:42 haschart Exp $ +/** + * Copyright (C) 2004 Bas Peters + * + * This file is part of MARC4J + * + * MARC4J is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * MARC4J is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with MARC4J; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +package org.marc4j; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * Defines and describes errors encountered in the processing a given MARC record. + * Used in conjunction with the MarcPermissiveReader class. + * + * @author Robert Haschart + * @version $Revision: 1.6 $ + */ +public class ErrorHandler { + + public final static int FATAL = 5; + public final static int MAJOR_ERROR = 4; + public final static int MINOR_ERROR = 3; + public final static int ERROR_TYPO = 2; + public final static int WARNING = 1; + public final static int INFO = 0; + + private List errors; + private String curRecordID; + private String curField; + private String curSubfield; + boolean hasMissingID; + int maxSeverity; + + public class Error { + private String curRecordID; + private String curField; + private String curSubfield; + private int severity; + private String message; + + public Error(String recordID, String field, String subfield, int severity, String message) + { + curRecordID = recordID; + curField = field; + curSubfield = subfield; + this.severity = severity; + this.message = message; + } + + public String toString() + { + String severityMsg = getSeverityMsg(severity); + String ret = severityMsg +" : " + message + " --- [ " + curField + " : " + curSubfield + " ]" ; + return(ret); + } + + public void setCurRecordID(String curRecordID) + { + this.curRecordID = curRecordID; + } + + public String getCurRecordID() + { + return(curRecordID); + } + + public int getSeverity() + { + return severity; + } + + public void setSeverity(int severity) + { + this.severity = severity; + } + } + + public ErrorHandler() + { + errors = null; + hasMissingID = false; + maxSeverity = INFO; + } + + public String getSeverityMsg(int 
severity) + { + switch (severity) { + case FATAL: return("FATAL "); + case MAJOR_ERROR: return("Major Error "); + case MINOR_ERROR: return("Minor Error "); + case ERROR_TYPO: return("Typo "); + case WARNING: return("Warning "); + case INFO: return("Info "); + } + return(null); + } + + public boolean hasErrors() + { + return (errors != null && errors.size() > 0 && maxSeverity > INFO); + } + + public int getMaxSeverity() + { + return (maxSeverity); + } + + public List getErrors() + { + if (errors == null || errors.size() == 0) return null; + return(errors); + } + + public void reset() + { + errors = null; + maxSeverity = INFO; + } + + public void addError(String id, String field, String subfield, int severity, String message) + { + if (errors == null) + { + errors = new LinkedList(); + hasMissingID = false; + } + if (id != null && id.equals("unknown")) hasMissingID = true; + else if (hasMissingID) + { + setRecordIDForAll(id); + } + errors.add(new Error(id, field, subfield, severity, message)); + if (severity > maxSeverity) maxSeverity = severity; + } + + public void addError(int severity, String message) + { + addError(curRecordID, curField, curSubfield, severity, message); + } + + public String getRecordID() + { + return curRecordID; + } + + private void setRecordIDForAll(String id) + { + if (id != null) + { + Iterator iter = errors.iterator(); + while (iter.hasNext()) + { + Error err = (Error)(iter.next()); + if (err.getCurRecordID() == null || err.getCurRecordID().equals("unknown")) + { + err.setCurRecordID(id); + } + } + hasMissingID = false; + } + } + + public void setRecordID(String recordID) + { + curRecordID = recordID; + if (hasMissingID && errors != null) setRecordIDForAll(recordID); + } + + public String getField() + { + return curField; + } + + public void setField(String curField) + { + this.curField = curField; + } + + public String getCurSubfield() + { + return curSubfield; + } + + public void setCurSubfield(String curSubfield) + { + this.curSubfield = 
curSubfield; + } +} diff --git a/src/org/marc4j/MarcDirStreamReader.java b/src/org/marc4j/MarcDirStreamReader.java new file mode 100644 index 00000000..d8c02723 --- /dev/null +++ b/src/org/marc4j/MarcDirStreamReader.java @@ -0,0 +1,145 @@ +// $Id: MarcDirStreamReader.java,v 1.1 2008/09/26 21:17:42 haschart Exp $ +/** + * Copyright (C) 2004 Bas Peters + * + * This file is part of MARC4J + * + * MARC4J is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * MARC4J is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with MARC4J; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +package org.marc4j; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FilenameFilter; + +import org.marc4j.MarcPermissiveStreamReader; +import org.marc4j.MarcReader; +import org.marc4j.marc.Record; + +/** + * + * @author Robert Haschart + * @version $Id: MarcDirStreamReader.java,v 1.1 2008/09/26 21:17:42 haschart Exp $ + * + */ +public class MarcDirStreamReader implements MarcReader +{ + File list[]; + MarcReader curFileReader; + int curFileNum; + boolean permissive; + boolean convertToUTF8; + String defaultEncoding; + + public MarcDirStreamReader(String dirName) + { + File dir = new File(dirName); + init(dir, false, false, null); + } + + public MarcDirStreamReader(File dir) + { + init(dir, false, false, null); + } + + public MarcDirStreamReader(String dirName, boolean permissive, boolean 
convertToUTF8) + { + File dir = new File(dirName); + init(dir, permissive, convertToUTF8, null); + } + + public MarcDirStreamReader(File dir, boolean permissive, boolean convertToUTF8) + { + init(dir, permissive, convertToUTF8, null); + } + + public MarcDirStreamReader(String dirName, boolean permissive, boolean convertToUTF8, String defaultEncoding) + { + File dir = new File(dirName); + init(dir, permissive, convertToUTF8, defaultEncoding); + } + + public MarcDirStreamReader(File dir, boolean permissive, boolean convertToUTF8, String defaultEncoding) + { + init(dir, permissive, convertToUTF8, defaultEncoding); + } + + private void init(File dir, boolean permissive, boolean convertToUTF8, String defaultEncoding) + { + FilenameFilter filter = new FilenameFilter() + { + public boolean accept(File dir, String name) + { + return(name.endsWith("mrc")); + } + }; + this.permissive = permissive; + this.convertToUTF8 = convertToUTF8; + list = dir.listFiles(filter); + java.util.Arrays.sort(list); + curFileNum = 0; + curFileReader = null; + this.defaultEncoding = defaultEncoding; + } + + public boolean hasNext() + { + if (curFileReader == null || curFileReader.hasNext() == false) + { + nextFile(); + } + return (curFileReader == null ? false : curFileReader.hasNext()); + } + + private void nextFile() + { + if (curFileNum != list.length) + { + try + { + System.err.println("Switching to input file: "+ list[curFileNum]); + if (defaultEncoding != null) + { + curFileReader = new MarcPermissiveStreamReader(new FileInputStream(list[curFileNum++]), permissive, convertToUTF8, defaultEncoding); + } + else + { + curFileReader = new MarcPermissiveStreamReader(new FileInputStream(list[curFileNum++]), permissive, convertToUTF8); + } + } + catch (FileNotFoundException e) + { + nextFile(); + } + } + else + { + curFileReader = null; + } + } + + public Record next() + { + if (curFileReader == null || curFileReader.hasNext() == false) + { + nextFile(); + } + return (curFileReader == null ? 
null : curFileReader.next()); + } + +} diff --git a/src/org/marc4j/MarcPermissiveStreamReader.java b/src/org/marc4j/MarcPermissiveStreamReader.java new file mode 100644 index 00000000..de76846d --- /dev/null +++ b/src/org/marc4j/MarcPermissiveStreamReader.java @@ -0,0 +1,1523 @@ +/** + * Copyright (C) 2004 Bas Peters + * + * This file is part of MARC4J + * + * MARC4J is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * MARC4J is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with MARC4J; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.marc4j; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.marc4j.Constants; +import org.marc4j.MarcException; +import org.marc4j.MarcReader; +import org.marc4j.converter.CharConverter; +import org.marc4j.converter.impl.AnselToUnicode; +import org.marc4j.converter.impl.Iso5426ToUnicode; +import org.marc4j.marc.ControlField; +import org.marc4j.marc.DataField; +import org.marc4j.marc.Leader; +import org.marc4j.marc.MarcFactory; +import org.marc4j.marc.Record; +import org.marc4j.marc.Subfield; +import 
org.marc4j.marc.VariableField; +import org.marc4j.marc.impl.Verifier; + +import com.ibm.icu.text.Normalizer; + +/** + * An iterator over a collection of MARC records in ISO 2709 format, that is designed + * to be able to handle MARC records that have errors in their structure or their encoding. + * If the permissive flag is set in the call to the constructor, or if an ErrorHandler object + * is passed in as a parameter to the constructor, this reader will do its best to detect + * and recover from a number of structural or encoding errors that can occur in a MARC record. + * Note that if this reader is not set to read permissively, it will operate pretty much + * identically to the MarcStreamReader class. + * + * Note that no attempt is made to validate the contents of the record at a semantic level. + * This reader does not know and does not care whether the record has a 245 field, or if the + * 008 field is the right length, but if the record claims to be UTF-8 or MARC8 encoded and + * you are seeing gibberish in the output, or if the reader is throwing an exception in trying + * to read a record, then this reader may be able to produce a usable record from the bad + * data you have. + * + * The ability to directly translate the record to UTF-8 as it is being read in is useful in + * cases where the UTF-8 version of the record will be used directly by the program that is + * reading the MARC data, for instance if the marc records are to be indexed into a SOLR search + * engine. Previously the MARC record could only be translated to UTF-8 as it was being written + * out via a MarcStreamWriter or a MarcXmlWriter. + * + *

+ * Example usage: + * + *

+ * InputStream input = new FileInputStream("file.mrc");
+ * MarcReader reader = new MarcPermissiveStreamReader(input, true, false);
+ * while (reader.hasNext()) {
+ *     Record record = reader.next();
+ *     // Process record
+ * }
+ * 
+ * + *

+ * Check the {@link org.marc4j.marc} package for examples about the use of + * the {@link org.marc4j.marc.Record} object model. + * Check the file org.marc4j.samples.PermissiveReaderExample.java for an + * example about using the MarcPermissiveStreamReader. + *

+ * + *

+ * When no encoding is given as a constructor argument the parser tries to + * resolve the encoding by looking at the character coding scheme (leader + * position 9) in MARC21 records. For UNIMARC records this position is not + * defined. If the reader is operating in permissive mode and no encoding + * is given as a constructor argument the reader will look at the leader, + * and also at the data of the record to determine to the best of its ability + * what character encoding scheme has been used to encode the data in a + * particular MARC record. + * + *

+ * + * @author Robert Haschart + * @version $Revision: 1.1 $ + * + */ +public class MarcPermissiveStreamReader implements MarcReader { + + private DataInputStream input = null; + + private Record record; + + private MarcFactory factory; + + private String encoding = "ISO8859_1"; + + // This represents the expected encoding of the data when a + // MARC record does not have a 'a' in character 9 of the leader. + private String defaultEncoding = "ISO8859_1"; + + private boolean convertToUTF8 = false; + + private boolean permissive = false; + + private CharConverter converterAnsel = null; + + private CharConverter converterUnimarc = null; + + // These are used to algorithmically determine what encoding scheme was + // used to encode the data in the Marc record + private String conversionCheck1 = null; + private String conversionCheck2 = null; + private String conversionCheck3 = null; + + private ErrorHandler errors; + + /** + * Constructs an instance with the specified input stream with possible additional functionality + * being enabled by setting permissive and/or convertToUTF8 to true. + * + * If permissive and convertToUTF8 are both set to false, it functions almost identically to the + * MarcStreamReader class. + */ + public MarcPermissiveStreamReader(InputStream input, boolean permissive, boolean convertToUTF8) { + this.permissive = permissive; + this.input = new DataInputStream(new BufferedInputStream(input)); + factory = MarcFactory.newInstance(); + this.convertToUTF8 = convertToUTF8; + errors = null; + if (permissive) + { + errors = new ErrorHandler(); + defaultEncoding = "BESTGUESS"; + } + } + + /** + * Constructs an instance with the specified input stream with possible additional functionality + * being enabled by passing in an ErrorHandler object and/or setting convertToUTF8 to true. + * + * If errors and convertToUTF8 are both set to false, it functions almost identically to the + * MarcStreamReader class. 
+ * + * If an ErrorHandler object is passed in, that object will be used to log and track any errors + * in the records as the records are decoded. After the next() function returns, you can query + * to determine whether any errors were detected in the decoding process. + * + * See the file org.marc4j.samples.PermissiveReaderExample.java to see how this can be done. + */ + public MarcPermissiveStreamReader(InputStream input, ErrorHandler errors, boolean convertToUTF8 ) + { + if (errors != null) + { + permissive = true; + defaultEncoding = "BESTGUESS"; + } + this.input = new DataInputStream(new BufferedInputStream(input)); + factory = MarcFactory.newInstance(); + this.convertToUTF8 = convertToUTF8; + this.errors = errors; + } + + /** + * Constructs an instance with the specified input stream with possible additional functionality + * being enabled by setting permissive and/or convertToUTF8 to true. + * + * If permissive and convertToUTF8 are both set to false, it functions almost identically to the + * MarcStreamReader class. + * + * The parameter defaultEncoding is used to specify the character encoding that is used in the records + * that will be read from the input stream. If permissive is set to true, you can specify "BESTGUESS" + * as the default encoding, and the reader will attempt to determine the character encoding used in the + * records being read from the input stream. This is especially useful if you are working with records + * downloaded from an external source and the encoding is either unknown or the encoding is different from + * what the records claim to be. 
+ */ + public MarcPermissiveStreamReader(InputStream input, boolean permissive, boolean convertToUTF8, String defaultEncoding) + { + this.permissive = permissive; + this.input = new DataInputStream(new BufferedInputStream(input)); + factory = MarcFactory.newInstance(); + this.convertToUTF8 = convertToUTF8; + this.defaultEncoding = defaultEncoding; + errors = null; + if (permissive) errors = new ErrorHandler(); + } + + /** + * Constructs an instance with the specified input stream with possible additional functionality + * being enabled by setting permissive and/or convertToUTF8 to true. + * + * If errors and convertToUTF8 are both set to false, it functions almost identically to the + * MarcStreamReader class. + * + * The parameter defaultEncoding is used to specify the character encoding that is used in the records + * that will be read from the input stream. If permissive is set to true, you can specify "BESTGUESS" + * as the default encoding, and the reader will attempt to determine the character encoding used in the + * records being read from the input stream. This is especially useful if you are working with records + * downloaded from an external source and the encoding is either unknown or the encoding is different from + * what the records claim to be. + * + * If an ErrorHandler object is passed in, that object will be used to log and track any errors + * in the records as the records are decoded. After the next() function returns, you can query + * to determine whether any errors were detected in the decoding process. + * + * See the file org.marc4j.samples.PermissiveReaderExample.java to see how this can be done. 
+ */ + public MarcPermissiveStreamReader(InputStream input, ErrorHandler errors, boolean convertToUTF8, String defaultEncoding) + { + this.permissive = true; + this.input = new DataInputStream(new BufferedInputStream(input)); + factory = MarcFactory.newInstance(); + this.convertToUTF8 = convertToUTF8; + this.defaultEncoding = defaultEncoding; + this.errors = errors; + } + + /** + * Returns true if the iteration has more records, false otherwise. + */ + public boolean hasNext() + { + try { + if (input.available() == 0) + return false; + } catch (IOException e) { + throw new MarcException(e.getMessage(), e); + } + return true; + } + + /** + * Returns the next record in the iteration. + * + * @return Record - the record object + */ + public Record next() + { + record = factory.newRecord(); + if (errors != null) errors.reset(); + + try { + byte[] byteArray = new byte[24]; + input.readFully(byteArray); + + int recordLength = parseRecordLength(byteArray); + byte[] recordBuf = new byte[recordLength - 24]; + if (permissive) + { + input.mark(recordLength * 2); + input.readFully(recordBuf); + if (recordBuf[recordBuf.length-1] != Constants.RT) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO, + "Record terminator character not found at end of record length"); + recordBuf = rereadPermissively(input, recordBuf, recordLength); + recordLength = recordBuf.length + 24; + } + } + else + { + input.readFully(recordBuf); + } + String tmp = new String(recordBuf); + parseRecord(record, byteArray, recordBuf, recordLength); + + if (this.convertToUTF8) + { + Leader l = record.getLeader(); + l.setCharCodingScheme('a'); + record.setLeader(l); + } + return(record); + } + catch (EOFException e) { + throw new MarcException("Premature end of file encountered", e); + } + catch (IOException e) { + throw new MarcException("an error occured reading input", e); + } + } + + private byte[] rereadPermissively(DataInputStream input, byte[] recordBuf, int recordLength) throws 
IOException + { + int loc = arrayContainsAt(recordBuf, Constants.RT); + if (loc != -1) // stated record length is too long + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO, + "Record terminator appears before stated record length, using shorter record"); + recordLength = loc + 24; + input.reset(); + recordBuf = new byte[recordLength - 24]; + input.readFully(recordBuf); + } + else // stated record length is too short read ahead + { + loc = recordLength - 24; + int c = 0; + do + { + c = input.read(); + loc++; + } while (loc < recordLength + 100 && c != Constants.RT && c != -1); + + if (c == Constants.RT) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO, + "Record terminator appears after stated record length, reading extra bytes"); + recordLength = loc + 24; + input.reset(); + recordBuf = new byte[recordLength - 24]; + input.readFully(recordBuf); + } + else if (c == -1) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, + "No Record terminator found, end of file reached, Terminator appended"); + recordLength = loc + 24; + input.reset(); + recordBuf = new byte[recordLength - 24 + 1]; + input.readFully(recordBuf); + recordBuf[recordBuf.length-1] = Constants.RT; + } + else + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, + "No Record terminator found within 100 byts of stated location, giving up."); + } + } + return(recordBuf); + } + + private void parseRecord(Record record, byte[] byteArray, byte[] recordBuf, int recordLength) + { + Leader ldr; + ldr = factory.newLeader(); + ldr.setRecordLength(recordLength); + int directoryLength=0; + // These variables are used when the permissive reader is trying to make its best guess + // as to what character encoding is actually used in the record being processed. 
+ conversionCheck1 = ""; + conversionCheck2 = ""; + conversionCheck3 = ""; + + try { + parseLeader(ldr, byteArray); + directoryLength = ldr.getBaseAddressOfData() - (24 + 1); + } + catch (IOException e) { + throw new MarcException("error parsing leader with data: " + + new String(byteArray), e); + } + catch (MarcException e) { + if (permissive) + { + if (recordBuf[recordBuf.length-1] == Constants.RT && recordBuf[recordBuf.length-2] == Constants.FT) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.WARNING, + "Error parsing leader, trying to re-read leader either shorter or longer"); + // make an attempt to recover record. + int offset = 0; + while (offset < recordBuf.length) + { + if (recordBuf[offset] == Constants.FT) + { + break; + } + offset++; + } + if (offset % 12 == 1) + { + // move one byte from body to leader, make new leader, and try again + errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, + "Leader appears to be too short, moving one byte from record body to leader, and trying again"); + byte oldBody[] = recordBuf; + recordBuf = new byte[oldBody.length-1]; + System.arraycopy(oldBody, 1, recordBuf, 0, oldBody.length-1); + directoryLength = offset-1; + ldr.setIndicatorCount(2); + ldr.setSubfieldCodeLength(2); + ldr.setImplDefined1((""+(char)byteArray[7]+" ").toCharArray()); + ldr.setImplDefined2((""+(char)byteArray[18]+(char)byteArray[19]+(char)byteArray[20]).toCharArray()); + ldr.setEntryMap("4500".toCharArray()); + if (byteArray[10] == (byte)' ' || byteArray[10] == (byte)'a') // if its ' ' or 'a' + { + ldr.setCharCodingScheme((char)byteArray[10]); + } + } + else if (offset % 12 == 11) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, + "Leader appears to be too long, moving one byte from leader to record body, and trying again"); + byte oldBody[] = recordBuf; + recordBuf = new byte[oldBody.length+1]; + System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length); + recordBuf[0] = (byte)'0'; + 
directoryLength = offset+1; + ldr.setIndicatorCount(2); + ldr.setSubfieldCodeLength(2); + ldr.setImplDefined1((""+(char)byteArray[7]+" ").toCharArray()); + ldr.setImplDefined2((""+(char)byteArray[16]+(char)byteArray[17]+(char)byteArray[18]).toCharArray()); + ldr.setEntryMap("4500".toCharArray()); + if (byteArray[8] == (byte)' ' || byteArray[8] == (byte)'a') // if its ' ' or 'a' + { + ldr.setCharCodingScheme((char)byteArray[10]); + } + if (byteArray[10] == (byte)' ' || byteArray[10] == (byte)'a') // if its ' ' or 'a' + { + ldr.setCharCodingScheme((char)byteArray[10]); + } + } + else + { + throw new MarcException("error parsing leader with data: " + + new String(byteArray), e); + } + } + } + else + { + throw new MarcException("error parsing leader with data: " + + new String(byteArray), e); + } + } + char tmp[] = ldr.getEntryMap(); + if (permissive && !(""+ tmp[0]+tmp[1]+tmp[2]+tmp[3]).equals("4500")) + { + if (tmp[0] >= '0' && tmp[0] <= '9' && + tmp[1] >= '0' && tmp[1] <= '9' && + tmp[2] >= '0' && tmp[2] <= '9' && + tmp[3] >= '0' && tmp[3] <= '9') + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.WARNING, + "Unusual character found at end of leader [ "+tmp[0]+tmp[1]+tmp[2]+tmp[3]+" ]"); + } + else + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO, + "Erroneous character found at end of leader [ "+tmp[0]+tmp[1]+tmp[2]+tmp[3]+" ]; changing them to the standard \"4500\""); + ldr.setEntryMap("4500".toCharArray()); + } + } + + // if MARC 21 then check encoding + switch (ldr.getCharCodingScheme()) { + case 'a': + encoding = "UTF8"; + break; + case ' ': + if (convertToUTF8) + encoding = defaultEncoding; + else + encoding = "ISO8859_1"; + break; + default: + if (convertToUTF8) + encoding = defaultEncoding; + else + encoding = "ISO8859_1"; + break; + + } + String utfCheck; + if (encoding.equalsIgnoreCase("BESTGUESS")) + { + try + { + String marc8EscSeqCheck = new String(recordBuf, "ISO-8859-1"); + // If record has MARC8 character set selection 
strings, it must be MARC8 encoded + if (marc8EscSeqCheck.split("\\e[-(,)$bsp]", 2).length > 1) + { + encoding = "MARC8"; + } + else + { + boolean hasHighBitChars = false; + for (int i = 0; i < recordBuf.length; i++) + { + if (recordBuf[i] < 0) // the high bit is set + { + hasHighBitChars = true; + break; + } + } + if (!hasHighBitChars) + { + encoding = "ISO8859_1"; // You can choose any encoding you want here, the results will be the same. + } + else + { + utfCheck = new String(recordBuf, "UTF-8"); + byte byteCheck[] = utfCheck.getBytes("UTF-8"); + encoding = "UTF8"; + if (recordBuf.length == byteCheck.length) + { + for (int i = 0; i < recordBuf.length; i++) + { + if (byteCheck[i] != recordBuf[i]) + { + encoding = "MARC8-Maybe"; + break; + } + } + } + else + { + encoding = "MARC8-Maybe"; + } + } + } + } + catch (UnsupportedEncodingException e) + { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + else if (permissive && encoding.equals("UTF8")) + { + try + { + utfCheck = new String(recordBuf, "UTF-8"); + byte byteCheck[] = utfCheck.getBytes("UTF-8"); + if (recordBuf.length != byteCheck.length) + { + boolean foundESC = false; + for (int i = 0; i < recordBuf.length; i++) + { + if (recordBuf[i] == 0x1B) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO, + "Record claims to be UTF-8, but its not. Its probably MARC8."); + encoding = "MARC8-Maybe"; + foundESC = true; + break; + } + if (byteCheck[i] != recordBuf[i]) + { + encoding = "MARC8-Maybe"; + } + + } + if (!foundESC) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO, + "Record claims to be UTF-8, but its not. It may be MARC8, or maybe UNIMARC, or maybe raw ISO-8859-1 "); + } + } + if (utfCheck.contains("a$1!")) + { + encoding = "MARC8-Broken"; + errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, + "Record claims to be UTF-8, but its not. 
It seems to be MARC8-encoded but with missing escape codes."); + } + } + catch (UnsupportedEncodingException e) + { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + else if (permissive && !encoding.equals("UTF8")) + { + try + { + utfCheck = new String(recordBuf, "UTF-8"); + byte byteCheck[] = utfCheck.getBytes("UTF-8"); + if (recordBuf.length == byteCheck.length) + { + for (int i = 0; i < recordBuf.length; i++) + { + // need to check for byte < 0 to see if the high bit is set, because Java doesn't have unsigned types. + if (recordBuf[i] < 0x00 || byteCheck[i] != recordBuf[i]) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, + "Record claims not to be UTF-8, but it seems to be."); + encoding = "UTF8-Maybe"; + break; + } + } + } + } + catch (UnsupportedEncodingException e) + { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + record.setLeader(ldr); + + boolean discardOneAtStartOfDirectory = false; + boolean discardOneSomewhereInDirectory = false; + + if ((directoryLength % 12) != 0) + { + if (permissive && directoryLength % 12 == 11 && recordBuf[1] != (byte)'0') + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, + "Directory length is not a multiple of 12 bytes long. Prepending a zero and trying to continue."); + byte oldBody[] = recordBuf; + recordBuf = new byte[oldBody.length+1]; + System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length); + recordBuf[0] = (byte)'0'; + directoryLength = directoryLength+1; + } + else + { + if (permissive && directoryLength % 12 == 1 && recordBuf[1] == (byte)'0' && recordBuf[2] == (byte)'0') + { + discardOneAtStartOfDirectory = true; + errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, + "Directory length is not a multiple of 12 bytes long. 
Discarding byte from start of directory and trying to continue."); + } + else if (permissive && directoryLength % 12 == 1 && recordLength > 10000 && recordBuf[0] == (byte)'0' && + recordBuf[1] == (byte)'0' && recordBuf[2] > (byte)'0' && recordBuf[2] <= (byte)'9') + { + discardOneSomewhereInDirectory = true; + errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, + "Directory length is not a multiple of 12 bytes long. Will look for oversized field and try to work around it."); + } + else + { + if (errors != null) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, + "Directory length is not a multiple of 12 bytes long. Unable to continue."); + } + throw new MarcException("Directory length is not a multiple of 12 bytes long. Unable to continue."); + } + } + } + DataInputStream inputrec = new DataInputStream(new ByteArrayInputStream(recordBuf)); + int size = directoryLength / 12; + + String[] tags = new String[size]; + int[] lengths = new int[size]; + + byte[] tag = new byte[3]; + byte[] length = new byte[4]; + byte[] start = new byte[5]; + + String tmpStr; + try { + if (discardOneAtStartOfDirectory) inputrec.read(); + int totalOffset = 0; + for (int i = 0; i < size; i++) + { + inputrec.readFully(tag); + tmpStr = new String(tag); + tags[i] = tmpStr; + + boolean proceedNormally = true; + if (discardOneSomewhereInDirectory) + { + byte lenCheck[] = new byte[10]; + inputrec.mark(20); + inputrec.readFully(lenCheck); + if (byteCompare(lenCheck, 4, 5, totalOffset)) // proceed normally + { + proceedNormally = true; + } + else if (byteCompare(lenCheck, 5, 5, totalOffset)) // field length is 5 bytes! Bad Marc record, proceed normally + { + discardOneSomewhereInDirectory = false; + errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, + "Field is longer than 9999 bytes. 
Writing this record out will result in a bad record."); + proceedNormally = false; + } + else + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, + "Unable to reconcile problems in directory. Unable to continue."); + throw new MarcException("Directory length is not a multiple of 12 bytes long. Unable to continue."); + } + inputrec.reset(); + } + if (proceedNormally) + { + inputrec.readFully(length); + tmpStr = new String(length); + lengths[i] = Integer.parseInt(tmpStr); + + inputrec.readFully(start); + } + else // length is 5 bytes long + { + inputrec.readFully(start); + tmpStr = new String(start); + lengths[i] = Integer.parseInt(tmpStr); + + inputrec.readFully(start); + } + totalOffset += lengths[i]; + } + + // If we still haven't found the extra byte, throw out the last byte and try to continue; + if (discardOneSomewhereInDirectory) inputrec.read(); + + if (inputrec.read() != Constants.FT) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, + "Expected field terminator at end of directory. 
Unable to continue."); + throw new MarcException("expected field terminator at end of directory"); + } + + int numBadLengths = 0; + + int totalLength = 0; + for (int i = 0; i < size; i++) + { + int fieldLength = getFieldLength(inputrec); + if (fieldLength+1 != lengths[i] && permissive) + { + if (numBadLengths < 3 && (totalLength + fieldLength < recordLength + 26)) + { + numBadLengths++; + lengths[i] = fieldLength+1; + errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR, + "Field length found in record different from length stated in the directory."); + if (fieldLength+1 > 9999) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, + "Field length is greater than 9999, record cannot be represented as a binary Marc record."); + } + } + } + totalLength += lengths[i]; + if (isControlField(tags[i])) + { + byteArray = new byte[lengths[i] - 1]; + inputrec.readFully(byteArray); + + if (inputrec.read() != Constants.FT) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, + "Expected field terminator at end of field. Unable to continue."); + throw new MarcException("expected field terminator at end of field"); + } + + ControlField field = factory.newControlField(); + field.setTag(tags[i]); + field.setData(getDataAsString(byteArray)); + record.addVariableField(field); + + } + else + { + byteArray = new byte[lengths[i]]; + inputrec.readFully(byteArray); + try { + record.addVariableField(parseDataField(tags[i], byteArray)); + } catch (IOException e) { + throw new MarcException( + "error parsing data field for tag: " + tags[i] + + " with data: " + + new String(byteArray), e); + } + } + } + + // We've determined that although the record says it is UTF-8, it is not. + // Here we make an attempt to determine the actual encoding of the data in the record. 
+ if (permissive && conversionCheck1.length() > 1 && + conversionCheck2.length() > 1 && conversionCheck3.length() > 1) + { + guessAndSelectCorrectNonUTF8Encoding(); + } + if (inputrec.read() != Constants.RT) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, + "Expected record terminator at end of record. Unable to continue."); + throw new MarcException("expected record terminator"); + } + } + catch (IOException e) + { + errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, + "Error reading from data file. Unable to continue."); + throw new MarcException("an error occured reading input", e); + } + } + + private boolean byteCompare(byte[] lenCheck, int offset, int length, int totalOffset) + { + int divisor = 1; + for (int i = offset + length - 1; i >= offset; i-- , divisor *= 10) + { + if (((totalOffset / divisor) % 10) + '0' != lenCheck[i]) + { + return(false); + } + } + return true; + } + + private boolean isControlField(String tag) + { + boolean isControl = false; + try { + isControl = Verifier.isControlField(tag); + } + catch (NumberFormatException nfe) + { + if (permissive) + { + errors.addError(record.getControlNumber(), tag, "n/a", ErrorHandler.ERROR_TYPO, + "Field tag contains non-numeric characters (" + tag + ")."); + isControl = false; + } + } + return isControl; + } + + private void guessAndSelectCorrectNonUTF8Encoding() + { + int defaultPart = 0; + if (record.getVariableField("245") == null) defaultPart = 1; + int partToUse = 0; + int l1 = conversionCheck1.length(); + int l2 = conversionCheck2.length(); + int l3 = conversionCheck3.length(); + int tst; + + if (l1 < l3 && l2 == l3 && defaultPart == 0) + { + errors.addError(ErrorHandler.INFO, "MARC8 translation shorter than ISO-8859-1, choosing MARC8."); + partToUse = 0; + } + else if (l2 < l1-2 && l2 < l3-2 ) + { + errors.addError(ErrorHandler.INFO, "Unimarc translation shortest, choosing it."); + partToUse = 1; + } + else if ((tst = onlyOneStartsWithUpperCase(conversionCheck1, 
conversionCheck2, conversionCheck3)) != -1) + { + partToUse = tst; + } + else if (l2 < l1 && l2 < l3 ) + { + errors.addError(ErrorHandler.INFO, "Unimarc translation shortest, choosing it."); + partToUse = 1; + } + else if (conversionCheck2.equals(conversionCheck3) && !conversionCheck1.trim().contains(" ")) + { + errors.addError(ErrorHandler.INFO, "Unimarc and ISO-8859-1 translations identical, choosing ISO-8859-1."); + partToUse = 2; + } + else if (!specialCharIsBetweenLetters(conversionCheck1)) + { + errors.addError(ErrorHandler.INFO, "To few letters in translations, choosing "+(defaultPart == 0 ? "MARC8" : "Unimarc")); + partToUse = defaultPart; + } + else if (l2 == l1 && l2 == l3) + { + errors.addError(ErrorHandler.INFO, "All three version equal length. Choosing ISO-8859-1 "); + partToUse = 2; + } + else if (l2 == l3 && defaultPart == 1) + { + errors.addError(ErrorHandler.INFO, "Unimarc and ISO-8859-1 translations equal length, choosing ISO-8859-1."); + partToUse = 2; + } + else + { + errors.addError(ErrorHandler.INFO, "No Determination made, defaulting to "+ (defaultPart == 0 ? 
"MARC8" : "Unimarc") ); + partToUse = defaultPart; + } + List fields = record.getVariableFields(); + Iterator iter = fields.iterator(); + while (iter.hasNext()) + { + VariableField field = iter.next(); + if (field instanceof DataField) + { + DataField df = (DataField)field; + List subf = df.getSubfields(); + Iterator sfiter = subf.iterator(); + while (sfiter.hasNext()) + { + Subfield sf = sfiter.next(); + if (sf.getData().contains("%%@%%")) + { + String parts[] = sf.getData().split("%%@%%", 3); + sf.setData(parts[partToUse]); + } + } + } + } + } + + private int onlyOneStartsWithUpperCase(String conversionCheck12, String conversionCheck22, String conversionCheck32) + { + if (conversionCheck1.length() == 0 || conversionCheck2.length() == 0 || conversionCheck3.length() == 0) return -1; + String check1Parts[] = conversionCheck1.trim().split("[|]>"); + String check2Parts[] = conversionCheck2.trim().split("[|]>"); + String check3Parts[] = conversionCheck3.trim().split("[|]>"); + for (int i = 1; i < check1Parts.length && i < check2Parts.length && i < check3Parts.length; i++) + { + boolean tst1 = Character.isUpperCase(check1Parts[i].charAt(0)); + boolean tst2 = Character.isUpperCase(check2Parts[i].charAt(0)); + boolean tst3 = Character.isUpperCase(check3Parts[i].charAt(0)); + if (tst1 && !tst2 && !tst3) + return(0); + if (!tst1 && tst2 && !tst3) + return(-1); + if (!tst1 && !tst2 && tst3) + return(2); + } + return -1; + } + + private boolean specialCharIsBetweenLetters(String conversionCheck) + { + boolean bewteenLetters = true; + for (int i = 0; i < conversionCheck.length(); i++) + { + int charCode = (int)(conversionCheck.charAt(i)); + if (charCode > 0x7f) + { + bewteenLetters = false; + if (i > 0 && Character.isLetter((int)(conversionCheck.charAt(i-1))) || + (i < conversionCheck.length()-1 && Character.isLetter((int)(conversionCheck.charAt(i+1))))) + { + bewteenLetters = true; + break; + } + } + } + return(bewteenLetters); + } + + private int arrayContainsAt(byte[] 
byteArray, int ft) + { + for (int i = 0; i < byteArray.length; i++) + { + if (byteArray[i] == (byte)ft) return(i); + } + return(-1); + } + + private DataField parseDataField(String tag, byte[] field) throws IOException + { + if (permissive) + { + errors.setRecordID(record.getControlNumber()); + errors.setField(tag); + errors.setCurSubfield("n/a"); + cleanupBadFieldSeperators(field); + } + ByteArrayInputStream bais = new ByteArrayInputStream(field); + char ind1 = (char) bais.read(); + char ind2 = (char) bais.read(); + + DataField dataField = factory.newDataField(); + dataField.setTag(tag); + dataField.setIndicator1(ind1); + dataField.setIndicator2(ind2); + + int code; + int size; + int readByte; + byte[] data; + Subfield subfield; + while (true) { + readByte = bais.read(); + if (readByte < 0) + break; + switch (readByte) { + case Constants.US: + code = bais.read(); + if (code < 0) + throw new IOException("unexpected end of data field"); + if (code == Constants.FT) + break; + size = getSubfieldLength(bais); + data = new byte[size]; + bais.read(data); + subfield = factory.newSubfield(); + if (permissive) errors.setCurSubfield("" + (char)code); + String dataAsString = getDataAsString(data); + if (permissive && code == Constants.US) + { + code = data[0]; + dataAsString = dataAsString.substring(1); + errors.addError(ErrorHandler.MAJOR_ERROR, + "Subfield tag is a subfield separator, using first character of field as subfield tag."); + } + subfield.setCode((char) code); + subfield.setData(dataAsString); + dataField.addSubfield(subfield); + break; + case Constants.FT: + break; + } + } + return dataField; + } + + static AnselToUnicode conv = null; + + private void cleanupBadFieldSeperators(byte[] field) + { + if (conv == null) conv = new AnselToUnicode(true); + boolean hasEsc = false; + boolean inMultiByte = false; + boolean justCleaned = false; + int mbOffset = 0; + + for (int i = 0 ; i < field.length-1; i++) + { + if (field[i] == 0x1B) + { + hasEsc = true; + if 
("(,)-'".indexOf((char)field[i+1]) != -1) + { + inMultiByte = false; + } + else if (i + 2 < field.length && field[i+1] == '$' && field[i+2] == '1') + { + inMultiByte = true; + mbOffset = 3; + } + else if (i + 3 < field.length && (field[i+1] == '$' || field[i+2] == '$')&& ( field[i+2] == '1' || field[i+3] == '1')) + { + inMultiByte = true; + mbOffset = 4; + } + + } + else if (inMultiByte && field[i] != 0x20) mbOffset = ( mbOffset == 0) ? 2 : mbOffset - 1; + if (inMultiByte && mbOffset == 0 && i + 2 < field.length) + { + char c; + byte f1 = field[i]; + byte f2 = field[i+1] == 0x20 ? field[i+2] : field[i+1]; + byte f3 = (field[i+1] == 0x20 || field[i+2] == 0x20) ? field[i+3] : field[i+2]; + c = conv.getMBChar(conv.makeMultibyte((char)((f1 == Constants.US) ? 0x7C : f1), + (char)((f2 == Constants.US) ? 0x7C : f2), + (char)((f3 == Constants.US) ? 0x7C : f3))); + if (c == 0 && !justCleaned) + { + errors.addError(ErrorHandler.MAJOR_ERROR, + "Bad Multibyte character found, reinterpreting data as non-multibyte data"); + inMultiByte = false; + } + else if (c == 0 && justCleaned) + { + c = conv.getMBChar(conv.makeMultibyte('!',(char)((f2 == Constants.US) ? 0x7C : f2), + (char)((f3 == Constants.US) ? 
0x7C : f3))); + if (c == 0) + { + errors.addError(ErrorHandler.MAJOR_ERROR, + "Bad Multibyte character found, reinterpreting data as non-multibyte data"); + inMultiByte = false; + } + else + { + errors.addError(ErrorHandler.MAJOR_ERROR, + "Character after restored vertical bar character makes bad multibyte character, changing it to \"!\""); + field[i] = '!'; + } + } + } + justCleaned = false; + if (field[i] == Constants.US ) + { + if (inMultiByte && mbOffset != 0) + { + field[i] = 0x7C; + errors.addError(ErrorHandler.MAJOR_ERROR, + "Subfield separator found in middle of a multibyte character, changing it to a vertical bar, and continuing"); + if (field[i+1] == '0') + { + if (field[i+2] == '(' && field[i+3] == 'B' ) + { + field[i+1] = 0x1B; + errors.addError(ErrorHandler.MAJOR_ERROR, + "Character after restored vertical bar character makes bad multibyte character, changing it to ESC"); + } + else + { + field[i+1] = 0x21; + errors.addError(ErrorHandler.MAJOR_ERROR, + "Character after restored vertical bar character makes bad multibyte character, changing it to \"!\""); + } + } + justCleaned = true; + } + else if (hasEsc && !((field[i+1] >= 'a' && field[i+1] <= 'z') || (field[i+1] >= '0' && field[i+1] <= '9'))) + { + errors.addError(ErrorHandler.MAJOR_ERROR, + "Subfield separator followed by invalid subfield tag, changing separator to a vertical bar, and continuing"); + field[i] = 0x7C; + justCleaned = true; + } + else if (hasEsc && i < field.length-3 && + (field[i+1] == '0' && field[i+2] == '(' && field[i+3] == 'B' )) + { + errors.addError(ErrorHandler.MAJOR_ERROR, + "Subfield separator followed by invalid subfield tag, changing separator to a vertical bar, and continuing"); + field[i] = 0x7C; + field[i+1] = 0x1B; + justCleaned = true; + } + else if (hasEsc && (field[i+1] == '0' )) + { + errors.addError(ErrorHandler.MAJOR_ERROR, + "Subfield separator followed by invalid subfield tag, changing separator to a vertical bar, and continuing"); + field[i] = 0x7C; + 
field[i+1] = 0x21; + justCleaned = true; + } + else if (field[i+1] == Constants.US && field[i+2] == Constants.US ) + { + errors.addError(ErrorHandler.MAJOR_ERROR, + "Three consecutive subfield separators, changing first two to vertical bars."); + field[i] = 0x7C; + field[i+1] = 0x7C; + justCleaned = true; + } + } + } + } + + private int getFieldLength(DataInputStream bais) throws IOException + { + bais.mark(9999); + int bytesRead = 0; + while (true) { + switch (bais.read()) { + case Constants.FT: + bais.reset(); + return bytesRead; + case -1: + bais.reset(); + if (permissive) + { + errors.addError(ErrorHandler.ERROR_TYPO, + "Field not terminated trying to continue"); + return (bytesRead); + } + else + throw new IOException("Field not terminated"); + case Constants.US: + default: + bytesRead++; + } + } + } + + private int getSubfieldLength(ByteArrayInputStream bais) throws IOException { + bais.mark(9999); + int bytesRead = 0; + while (true) { + switch (bais.read()) { + case Constants.FT: + bais.reset(); + return bytesRead; + case Constants.US: + bais.reset(); + return bytesRead; + case -1: + bais.reset(); + if (permissive) + { + errors.addError(ErrorHandler.ERROR_TYPO, "Subfield not terminated trying to continue"); + return (bytesRead); + } + else + throw new IOException("subfield not terminated"); + default: + bytesRead++; + } + } + } + + private int parseRecordLength(byte[] leaderData) throws IOException { + InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream( + leaderData)); + int length = -1; + char[] tmp = new char[5]; + isr.read(tmp); + try { + length = Integer.parseInt(new String(tmp)); + } catch (NumberFormatException e) { + errors.addError(ErrorHandler.FATAL, + "Unable to parse record length, Unable to Continue"); + throw new MarcException("unable to parse record length", e); + } + return(length); + } + + private void parseLeader(Leader ldr, byte[] leaderData) throws IOException { + InputStreamReader isr = new InputStreamReader(new 
ByteArrayInputStream( + leaderData)); + char[] tmp = new char[5]; + isr.read(tmp); + // Skip over bytes for record length, If we get here, its already been computed. + ldr.setRecordStatus((char) isr.read()); + ldr.setTypeOfRecord((char) isr.read()); + tmp = new char[2]; + isr.read(tmp); + ldr.setImplDefined1(tmp); + ldr.setCharCodingScheme((char) isr.read()); + char indicatorCount = (char) isr.read(); + char subfieldCodeLength = (char) isr.read(); + char baseAddr[] = new char[5]; + isr.read(baseAddr); + tmp = new char[3]; + isr.read(tmp); + ldr.setImplDefined2(tmp); + tmp = new char[4]; + isr.read(tmp); + ldr.setEntryMap(tmp); + isr.close(); + try { + ldr.setIndicatorCount(Integer.parseInt(String.valueOf(indicatorCount))); + } catch (NumberFormatException e) { + throw new MarcException("unable to parse indicator count", e); + } + try { + ldr.setSubfieldCodeLength(Integer.parseInt(String + .valueOf(subfieldCodeLength))); + } catch (NumberFormatException e) { + throw new MarcException("unable to parse subfield code length", e); + } + try { + ldr.setBaseAddressOfData(Integer.parseInt(new String(baseAddr))); + } catch (NumberFormatException e) { + throw new MarcException("unable to parse base address of data", e); + } + + } + + private String getDataAsString(byte[] bytes) + { + String dataElement = null; + if (encoding.equals("UTF-8") || encoding.equals("UTF8")) + { + try { + dataElement = new String(bytes, "UTF-8"); + } + catch (UnsupportedEncodingException e) { + throw new MarcException("unsupported encoding", e); + } + } + else if (encoding.equals("UTF8-Maybe")) + { + try { + dataElement = new String(bytes, "UTF-8"); + } + catch (UnsupportedEncodingException e) { + throw new MarcException("unsupported encoding", e); + } + } + else if (encoding.equals("MARC-8") || encoding.equals("MARC8")) + { + dataElement = getMarc8Conversion(bytes); + } + else if (encoding.equalsIgnoreCase("Unimarc") || encoding.equals("IS05426")) + { + dataElement = getUnimarcConversion(bytes); + 
} + else if (encoding.equals("MARC8-Maybe")) + { + String dataElement1 = getMarc8Conversion(bytes); + String dataElement2 = getUnimarcConversion(bytes); + String dataElement3 = null; + try + { + dataElement3 = new String(bytes, "ISO-8859-1"); + } + catch (UnsupportedEncodingException e) + { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if (dataElement1.equals(dataElement2) && dataElement1.equals(dataElement3)) + { + dataElement = dataElement1; + } + else + { + conversionCheck1 = conversionCheck1 + "|>" + Normalizer.compose(dataElement1, false); + conversionCheck2 = conversionCheck2 + "|>" + dataElement2; + conversionCheck3 = conversionCheck3 + "|>" + dataElement3; + dataElement = dataElement1 + "%%@%%" + dataElement2 + "%%@%%" + dataElement3; + } + } + else if (encoding.equals("MARC8-Broken")) + { + try + { + dataElement = new String(bytes, "ISO-8859-1"); + } + catch (UnsupportedEncodingException e) + { + // TODO Auto-generated catch block + e.printStackTrace(); + } + String newdataElement = dataElement.replaceAll("<", "<"); + newdataElement = newdataElement.replaceAll(">", ">"); + newdataElement = newdataElement.replaceAll("&", "&"); + newdataElement = newdataElement.replaceAll("'", "'"); + newdataElement = newdataElement.replaceAll(""", "\""); + if (!newdataElement.equals(dataElement)) + { + dataElement = newdataElement; + errors.addError(ErrorHandler.ERROR_TYPO, "Subfield contains escaped html character entities, un-escaping them. 
"); + } + String rep1 = ""+(char)0x1b+"\\$1$1"; + String rep2 = ""+(char)0x1b+"\\(B"; + newdataElement = dataElement.replaceAll("\\$1(.)", rep1); + newdataElement = newdataElement.replaceAll("\\(B", rep2); + if (!newdataElement.equals(dataElement)) + { + dataElement = newdataElement; + errors.addError(ErrorHandler.MAJOR_ERROR, "Subfield seems to be missing MARC8 escape sequences, trying to restore them."); + } + try + { + dataElement = getMarc8Conversion(dataElement.getBytes("ISO-8859-1")); + } + catch (UnsupportedEncodingException e) + { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + else if (encoding.equals("ISO-8859-1") || encoding.equals("ISO8859_1")) + { + try { + dataElement = new String(bytes, "ISO-8859-1"); + } + catch (UnsupportedEncodingException e) { + throw new MarcException("unsupported encoding", e); + } + } + else + { + throw new MarcException("Unknown or unsupported Marc character encoding:" + encoding); + } + if (errors != null && dataElement.matches("[^&]*&[a-z]*;.*")) + { + String newdataElement = dataElement.replaceAll("<", "<"); + newdataElement = newdataElement.replaceAll(">", ">"); + newdataElement = newdataElement.replaceAll("&", "&"); + newdataElement = newdataElement.replaceAll("'", "'"); + newdataElement = newdataElement.replaceAll(""", "\""); + if (!newdataElement.equals(dataElement)) + { + dataElement = newdataElement; + errors.addError(ErrorHandler.ERROR_TYPO, "Subfield contains escaped html character entities, un-escaping them. 
"); + } + } + return dataElement; + } + + private boolean byteArrayContains(byte[] bytes, byte[] seq) + { + for ( int i = 0; i < bytes.length - seq.length; i++) + { + if (bytes[i] == seq[0]) + { + for (int j = 0; j < seq.length; j++) + { + if (bytes[i+j] != seq[j]) + { + break; + } + if (j == seq.length-1) return(true); + } + } + } + return(false); + } + + static byte badEsc[] = { (byte)('b'), (byte)('-'), 0x1b, (byte)('s') }; + static byte overbar[] = { (byte)(char)(0xaf) }; + + private String getMarc8Conversion(byte[] bytes) + { + String dataElement = null; + if (converterAnsel == null) converterAnsel = new AnselToUnicode(errors); + if (permissive && (byteArrayContains(bytes, badEsc) || byteArrayContains(bytes, overbar))) + { + String newDataElement = null; + try + { + dataElement = new String(bytes, "ISO-8859-1"); + newDataElement = dataElement.replaceAll("(\\e)b-\\es([psb])", "$1$2"); + if (!newDataElement.equals(dataElement)) + { + dataElement = newDataElement; + errors.addError(ErrorHandler.ERROR_TYPO, "Subfield contains odd pattern of subscript or superscript escapes. 
"); + } + newDataElement = dataElement.replace((char)0xaf, (char)0xe5); + if (!newDataElement.equals(dataElement)) + { + dataElement = newDataElement; + errors.addError(ErrorHandler.ERROR_TYPO, "Subfield contains 0xaf overbar character, changing it to proper MARC8 representation "); + } + dataElement = converterAnsel.convert(dataElement); + } + catch (UnsupportedEncodingException e) + { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + else + { + dataElement = converterAnsel.convert(bytes); + } + if (permissive && dataElement.matches("[^&]*&#x[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f];.*")) + { + Pattern pattern = Pattern.compile("&#x([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]);"); + Matcher matcher = pattern.matcher(dataElement); + StringBuffer newElement = new StringBuffer(); + int prevEnd = 0; + while (matcher.find()) + { + newElement.append(dataElement.substring(prevEnd, matcher.start())); + newElement.append(getChar(matcher.group(1))); + prevEnd = matcher.end(); + } + newElement.append(dataElement.substring(prevEnd)); + dataElement = newElement.toString(); + } + return(dataElement); + } + + private String getUnimarcConversion(byte[] bytes) + { + if (converterUnimarc == null) converterUnimarc = new Iso5426ToUnicode(); + String dataElement = converterUnimarc.convert(bytes); + dataElement = dataElement.replaceAll("\u0088", ""); + dataElement = dataElement.replaceAll("\u0089", ""); +// for ( int i = 0 ; i < bytes.length; i++) +// { +// if (bytes[i] == -120 || bytes[i] == -119) +// { +// char tmp = (char)bytes[i]; +// char temp2 = dataElement.charAt(0); +// char temp3 = dataElement.charAt(4); +// int tmpi = (int)tmp; +// int tmp2 = (int)temp2; +// int tmp3 = (int)temp3; +// i = i; +// +// } +// } + if (dataElement.matches("[^<]*.*")) + { + Pattern pattern = Pattern.compile(""); + Matcher matcher = pattern.matcher(dataElement); + StringBuffer newElement = new StringBuffer(); + int prevEnd = 0; + while (matcher.find()) + { + 
newElement.append(dataElement.substring(prevEnd, matcher.start())); + newElement.append(getChar(matcher.group(1))); + prevEnd = matcher.end(); + } + newElement.append(dataElement.substring(prevEnd)); + dataElement = newElement.toString(); + } + return(dataElement); + + } + + private String getChar(String charCodePoint) + { + int charNum = Integer.parseInt(charCodePoint, 16); + String result = ""+((char)charNum); + return(result); + } + + public boolean isPermissive() + { + return permissive; + } + + public void setPermissive(boolean permissive) + { + this.permissive = permissive; + } + + public boolean hasErrors() + { + return(errors.hasErrors()); + } + + public List getErrors() + { + return(errors.getErrors()); + } + +} \ No newline at end of file diff --git a/src/org/marc4j/MarcStreamReader.java b/src/org/marc4j/MarcStreamReader.java index 27d85dae..96215969 100644 --- a/src/org/marc4j/MarcStreamReader.java +++ b/src/org/marc4j/MarcStreamReader.java @@ -1,4 +1,4 @@ -// $Id: MarcStreamReader.java,v 1.10 2006/12/04 18:45:44 bpeters Exp $ +// $Id: MarcStreamReader.java,v 1.11 2008/09/26 21:17:42 haschart Exp $ /** * Copyright (C) 2004 Bas Peters * @@ -20,18 +20,27 @@ */ package org.marc4j; +import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.EOFException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import org.marc4j.converter.CharConverter; +import org.marc4j.converter.impl.AnselToUnicode; import org.marc4j.marc.ControlField; import org.marc4j.marc.DataField; import org.marc4j.marc.Leader; import org.marc4j.marc.MarcFactory; import org.marc4j.marc.Record; import org.marc4j.marc.Subfield; +import org.marc4j.marc.VariableField; import org.marc4j.marc.impl.Verifier; /** @@ -61,12 +70,12 @@ *

* * @author Bas Peters - * @version $Revision: 1.10 $ + * @version $Revision: 1.11 $ * */ public class MarcStreamReader implements MarcReader { - private InputStream input = null; + private DataInputStream input = null; private Record record; @@ -75,8 +84,8 @@ public class MarcStreamReader implements MarcReader { private String encoding = "ISO8859_1"; private boolean override = false; - - private boolean hasNext = true; + + private CharConverter converterAnsel = null; /** * Constructs an instance with the specified input stream. @@ -86,11 +95,10 @@ public MarcStreamReader(InputStream input) { } /** - * Constructs an instance with the specified input stream and character - * encoding. + * Constructs an instance with the specified input stream. */ public MarcStreamReader(InputStream input, String encoding) { - this.input = input; + this.input = new DataInputStream(new BufferedInputStream(input)); factory = MarcFactory.newInstance(); if (encoding != null) { this.encoding = encoding; @@ -116,117 +124,121 @@ public boolean hasNext() { * * @return Record - the record object */ - public Record next() { - Leader ldr; - int bytesRead = 0; - + public Record next() + { record = factory.newRecord(); try { byte[] byteArray = new byte[24]; - bytesRead = input.read(byteArray); - - if (bytesRead == -1) - throw new MarcException("no data to read"); - - while (bytesRead != -1 && bytesRead != byteArray.length) - bytesRead += input.read(byteArray, bytesRead, byteArray.length - - bytesRead); - - try { - ldr = parseLeader(byteArray); - } catch (IOException e) { - throw new MarcException("error parsing leader with data: " - + new String(byteArray), e); - } - - // if MARC 21 then check encoding - switch (ldr.getCharCodingScheme()) { - case ' ': - if (!override) - encoding = "ISO8859_1"; - break; - case 'a': - if (!override) - encoding = "UTF8"; - } - - record.setLeader(ldr); + input.readFully(byteArray); - int directoryLength = ldr.getBaseAddressOfData() - (24 + 1); - if ((directoryLength 
% 12) != 0) - throw new MarcException("invalid directory"); - int size = directoryLength / 12; - - String[] tags = new String[size]; - int[] lengths = new int[size]; + int recordLength = parseRecordLength(byteArray); + byte[] recordBuf = new byte[recordLength - 24]; + input.readFully(recordBuf); + parseRecord(record, byteArray, recordBuf, recordLength); + return(record); + } + catch (EOFException e) { + throw new MarcException("Premature end of file encountered", e); + } + catch (IOException e) { + throw new MarcException("an error occured reading input", e); + } + } + + private void parseRecord(Record record, byte[] byteArray, byte[] recordBuf, int recordLength) + { + Leader ldr; + ldr = factory.newLeader(); + ldr.setRecordLength(recordLength); + int directoryLength=0; + + try { + parseLeader(ldr, byteArray); + directoryLength = ldr.getBaseAddressOfData() - (24 + 1); + } + catch (IOException e) { + throw new MarcException("error parsing leader with data: " + + new String(byteArray), e); + } + catch (MarcException e) { + throw new MarcException("error parsing leader with data: " + + new String(byteArray), e); + } - byte[] tag = new byte[3]; - byte[] length = new byte[4]; - byte[] start = new byte[5]; + // if MARC 21 then check encoding + switch (ldr.getCharCodingScheme()) { + case ' ': + if (!override) + encoding = "ISO-8859-1"; + break; + case 'a': + if (!override) + encoding = "UTF8"; + } + record.setLeader(ldr); + + if ((directoryLength % 12) != 0) + { + throw new MarcException("invalid directory"); + } + DataInputStream inputrec = new DataInputStream(new ByteArrayInputStream(recordBuf)); + int size = directoryLength / 12; - String tmp; + String[] tags = new String[size]; + int[] lengths = new int[size]; - for (int i = 0; i < size; i++) { - bytesRead = input.read(tag); + byte[] tag = new byte[3]; + byte[] length = new byte[4]; + byte[] start = new byte[5]; - while (bytesRead != -1 && bytesRead != tag.length) - bytesRead += input.read(tag, bytesRead, tag.length - 
- bytesRead); + String tmp; + try { + for (int i = 0; i < size; i++) + { + inputrec.readFully(tag); tmp = new String(tag); tags[i] = tmp; - - bytesRead = input.read(length); - - while (bytesRead != -1 && bytesRead != length.length) - bytesRead += input.read(length, bytesRead, length.length - - bytesRead); - + + inputrec.readFully(length); tmp = new String(length); lengths[i] = Integer.parseInt(tmp); - - bytesRead = input.read(start); - - while (bytesRead != -1 && bytesRead != start.length) - bytesRead += input.read(start, bytesRead, start.length - - bytesRead); + + inputrec.readFully(start); } - - if (input.read() != Constants.FT) - throw new MarcException( - "expected field terminator at end of directory"); - - for (int i = 0; i < size; i++) { - if (Verifier.isControlField(tags[i])) { + + if (inputrec.read() != Constants.FT) + { + throw new MarcException("expected field terminator at end of directory"); + } + + for (int i = 0; i < size; i++) + { + int fieldLength = getFieldLength(inputrec); + if (Verifier.isControlField(tags[i])) + { byteArray = new byte[lengths[i] - 1]; - bytesRead = input.read(byteArray); - - while (bytesRead != -1 && bytesRead != byteArray.length) - bytesRead += input.read(byteArray, bytesRead, - byteArray.length - bytesRead); - - if (input.read() != Constants.FT) - throw new MarcException( - "expected field terminator at end of field"); - + inputrec.readFully(byteArray); + + if (inputrec.read() != Constants.FT) + { + throw new MarcException("expected field terminator at end of field"); + } + ControlField field = factory.newControlField(); field.setTag(tags[i]); field.setData(getDataAsString(byteArray)); record.addVariableField(field); - - } else { + } + else + { byteArray = new byte[lengths[i]]; - bytesRead = input.read(byteArray); - - while (bytesRead != -1 && bytesRead != byteArray.length) - bytesRead += input.read(byteArray, bytesRead, - byteArray.length - bytesRead); - + inputrec.readFully(byteArray); + try { - 
record.addVariableField(parseDataField(tags[i], - byteArray)); + record.addVariableField(parseDataField(tags[i], byteArray)); } catch (IOException e) { throw new MarcException( "error parsing data field for tag: " + tags[i] @@ -235,14 +247,16 @@ record = factory.newRecord(); } } } - - if (input.read() != Constants.RT) + + if (inputrec.read() != Constants.RT) + { throw new MarcException("expected record terminator"); - - } catch (IOException e) { - throw new MarcException("an error occured reading input", e); + } + } + catch (IOException e) + { + throw new MarcException("an error occured reading input", e); } - return record; } private DataField parseDataField(String tag, byte[] field) @@ -286,6 +300,25 @@ private DataField parseDataField(String tag, byte[] field) } return dataField; } + + private int getFieldLength(DataInputStream bais) throws IOException + { + bais.mark(9999); + int bytesRead = 0; + while (true) { + switch (bais.read()) { + case Constants.FT: + bais.reset(); + return bytesRead; + case -1: + bais.reset(); + throw new IOException("Field not terminated"); + case Constants.US: + default: + bytesRead++; + } + } + } private int getSubfieldLength(ByteArrayInputStream bais) throws IOException { bais.mark(9999); @@ -305,63 +338,89 @@ private int getSubfieldLength(ByteArrayInputStream bais) throws IOException { } } - private Leader parseLeader(byte[] leaderData) throws IOException { + private int parseRecordLength(byte[] leaderData) throws IOException { InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream( leaderData)); - Leader ldr = factory.newLeader(); + int length = -1; char[] tmp = new char[5]; isr.read(tmp); try { - ldr.setRecordLength(Integer.parseInt(new String(tmp))); + length = Integer.parseInt(new String(tmp)); } catch (NumberFormatException e) { throw new MarcException("unable to parse record length", e); } + return(length); + } + + private void parseLeader(Leader ldr, byte[] leaderData) throws IOException { + InputStreamReader 
isr = new InputStreamReader(new ByteArrayInputStream( + leaderData)); + char[] tmp = new char[5]; + isr.read(tmp); + // Skip over bytes for record length, If we get here, its already been computed. ldr.setRecordStatus((char) isr.read()); ldr.setTypeOfRecord((char) isr.read()); tmp = new char[2]; isr.read(tmp); ldr.setImplDefined1(tmp); ldr.setCharCodingScheme((char) isr.read()); + char indicatorCount = (char) isr.read(); + char subfieldCodeLength = (char) isr.read(); + char baseAddr[] = new char[5]; + isr.read(baseAddr); + tmp = new char[3]; + isr.read(tmp); + ldr.setImplDefined2(tmp); + tmp = new char[4]; + isr.read(tmp); + ldr.setEntryMap(tmp); + isr.close(); try { - ldr.setIndicatorCount(Integer.parseInt(String.valueOf((char) isr - .read()))); + ldr.setIndicatorCount(Integer.parseInt(String.valueOf(indicatorCount))); } catch (NumberFormatException e) { throw new MarcException("unable to parse indicator count", e); } try { ldr.setSubfieldCodeLength(Integer.parseInt(String - .valueOf((char) isr.read()))); + .valueOf(subfieldCodeLength))); } catch (NumberFormatException e) { throw new MarcException("unable to parse subfield code length", e); } - tmp = new char[5]; - isr.read(tmp); try { - ldr.setBaseAddressOfData(Integer.parseInt(new String(tmp))); + ldr.setBaseAddressOfData(Integer.parseInt(new String(baseAddr))); } catch (NumberFormatException e) { throw new MarcException("unable to parse base address of data", e); } - tmp = new char[3]; - isr.read(tmp); - ldr.setImplDefined2(tmp); - tmp = new char[4]; - isr.read(tmp); - ldr.setEntryMap(tmp); - isr.close(); - return ldr; + } - private String getDataAsString(byte[] bytes) { + private String getDataAsString(byte[] bytes) + { String dataElement = null; - if (encoding != null) + if (encoding.equals("UTF-8") || encoding.equals("UTF8")) + { + try { + dataElement = new String(bytes, "UTF8"); + } + catch (UnsupportedEncodingException e) { + throw new MarcException("unsupported encoding", e); + } + } + else if 
(encoding.equals("MARC-8") || encoding.equals("MARC8")) + { + if (converterAnsel == null) converterAnsel = new AnselToUnicode(); + dataElement = converterAnsel.convert(bytes); + } + else if (encoding.equals("ISO-8859-1") || encoding.equals("ISO8859_1")) + { try { - dataElement = new String(bytes, encoding); - } catch (UnsupportedEncodingException e) { + dataElement = new String(bytes, "ISO-8859-1"); + } + catch (UnsupportedEncodingException e) { throw new MarcException("unsupported encoding", e); } - else - dataElement = new String(bytes); + } return dataElement; } - + } \ No newline at end of file diff --git a/src/org/marc4j/MarcXmlParserThread.java b/src/org/marc4j/MarcXmlParserThread.java index fc7c4ced..5c98b5f4 100644 --- a/src/org/marc4j/MarcXmlParserThread.java +++ b/src/org/marc4j/MarcXmlParserThread.java @@ -1,4 +1,4 @@ -// $Id: MarcXmlParserThread.java,v 1.2 2006/05/20 09:25:46 bpeters Exp $ +// $Id: MarcXmlParserThread.java,v 1.3 2008/09/26 21:17:42 haschart Exp $ /** * Copyright (C) 2004 Bas Peters * @@ -30,7 +30,7 @@ * MARCXML data. * * @author Bas Peters - * @version $Revision: 1.2 $ + * @version $Revision: 1.3 $ */ public class MarcXmlParserThread extends Thread { @@ -115,7 +115,12 @@ public void run() { parser.parse(input); else parser.parse(input, th); - } finally { + } + catch (MarcException me) + { + queue.passException(me); + } + finally { queue.end(); } } diff --git a/src/org/marc4j/RecordStack.java b/src/org/marc4j/RecordStack.java index a0a72699..97ff7773 100644 --- a/src/org/marc4j/RecordStack.java +++ b/src/org/marc4j/RecordStack.java @@ -1,4 +1,4 @@ -// $Id: RecordStack.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $ +// $Id: RecordStack.java,v 1.2 2008/09/26 21:17:42 haschart Exp $ /** * Copyright (C) 2004 Bas Peters * @@ -31,12 +31,12 @@ * Record objects created by MarcXmlParser. 
* * @author Bas Peters - * @version $Revision: 1.1 $ + * @version $Revision: 1.2 $ */ public class RecordStack { private List list; - + private RuntimeException re = null; private boolean eof = false; /** @@ -77,6 +77,7 @@ public synchronized Record pop() { } catch (Exception e) { } } + if (re != null) throw(re); Record record = null; if (list.size() > 0) record = (Record) list.remove(0); @@ -98,12 +99,23 @@ public synchronized boolean hasNext() { } catch (Exception e) { } } - + if (re != null) throw(re); if (!isEmpty() || !eof) return true; return false; } + /** + * Passes the exception to the thread where the MarcXMLReader is running, so that the next() call + * that is blocked waiting for this thread, will receive the exception. + * + */ + public synchronized void passException(RuntimeException e) { + re = e; + eof = true; + notifyAll(); + } + /** * Called when the end of the document is reached. * diff --git a/src/org/marc4j/converter/CharConverter.java b/src/org/marc4j/converter/CharConverter.java index e04b94cb..74d4e2f7 100644 --- a/src/org/marc4j/converter/CharConverter.java +++ b/src/org/marc4j/converter/CharConverter.java @@ -1,4 +1,4 @@ -//$Id: CharConverter.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $ +//$Id: CharConverter.java,v 1.2 2008/09/26 21:18:16 haschart Exp $ /** * Copyright (C) 2005 Bas Peters * @@ -25,9 +25,9 @@ * Implement this class to create a character converter. 
* * @author Bas Peters - * @version $Revision: 1.1 $ + * @version $Revision: 1.2 $ */ -public interface CharConverter { +public abstract class CharConverter { /** * Converts the dataElement and returns the result as a String @@ -36,6 +36,25 @@ public interface CharConverter { * @param dataElement the data to convert * @return String the conversion result */ - public String convert(String dataElement); + public abstract String convert(char[] dataElement); + + public String convert(byte[] data) + { + char cData[] = new char[data.length]; + for (int i = 0; i < data.length; i++) + { + byte b = data[i]; + cData[i] = (char)(b >= 0 ? b : 256 + b); + } + return convert(cData); + } + + public String convert(String dataElement) + { + char[] data = null; + data = dataElement.toCharArray(); + return (convert(data)); + } + } \ No newline at end of file diff --git a/src/org/marc4j/converter/impl/AnselToUnicode.java b/src/org/marc4j/converter/impl/AnselToUnicode.java index 6a0b3754..c54b29b3 100644 --- a/src/org/marc4j/converter/impl/AnselToUnicode.java +++ b/src/org/marc4j/converter/impl/AnselToUnicode.java @@ -1,4 +1,4 @@ -// $Id: AnselToUnicode.java,v 1.3 2005/12/14 17:11:30 bpeters Exp $ +// $Id: AnselToUnicode.java,v 1.4 2008/09/26 21:17:42 haschart Exp $ /** * Copyright (C) 2002 Bas Peters (mail@bpeters.com) * @@ -21,8 +21,11 @@ package org.marc4j.converter.impl; import java.io.InputStream; +import java.lang.reflect.Constructor; import java.util.Vector; +import org.marc4j.ErrorHandler; +import org.marc4j.MarcException; import org.marc4j.converter.CharConverter; /** @@ -37,9 +40,9 @@ * * @author Bas Peters * @author Corey Keith - * @version $Revision: 1.3 $ + * @version $Revision: 1.4 $ */ -public class AnselToUnicode implements CharConverter { +public class AnselToUnicode extends CharConverter { class Queue extends Vector { @@ -101,20 +104,80 @@ public String toString() { } } - protected CodeTable ct; + protected CodeTableInterface ct; protected boolean loadedMultibyte = 
false; + protected ErrorHandler errorList = null; /** * Creates a new instance and loads the MARC4J supplied * conversion tables based on the official LC tables. * */ - public AnselToUnicode() { - this(AnselToUnicode.class - .getResourceAsStream("resources/codetablesnocjk.xml")); + public AnselToUnicode() + { + ct = loadGeneratedTable(false); } + + /** + * Creates a new instance and loads the MARC4J supplied + * conversion tables based on the official LC tables. + * + */ + public AnselToUnicode(boolean loadMultibyte) + { + ct = loadGeneratedTable(loadMultibyte); + } + /** + * Creates a new instance and loads the MARC4J supplied + * conversion tables based on the official LC tables. + * + */ + public AnselToUnicode(ErrorHandler errorList) + { + ct = loadGeneratedTable(false); + this.errorList = errorList; + } + + /** + * Creates a new instance and loads the MARC4J supplied + * conversion tables based on the official LC tables. + * + */ + public AnselToUnicode(ErrorHandler errorList, boolean loadMultibyte) + { + ct = loadGeneratedTable(loadMultibyte); + this.errorList = errorList; + } + + private CodeTableInterface loadGeneratedTable(boolean loadMultibyte) + { + try + { + Class generated = Class.forName("org.marc4j.converter.impl.CodeTableGenerated"); + Constructor cons = generated.getConstructor(); + Object ct = cons.newInstance(); + loadedMultibyte = true; + return((CodeTableInterface)ct); + } + catch (Exception e) + { + CodeTableInterface ct; + if (loadMultibyte) + { + ct = new CodeTable(AnselToUnicode.class.getResourceAsStream("resources/codetables.xml")); + } + else + { + ct = new CodeTable(AnselToUnicode.class.getResourceAsStream("resources/codetablesnocjk.xml")); + } + loadedMultibyte = loadMultibyte; + return(ct); + } + + } + /** * Constructs an instance with the specified pathname. 
* @@ -144,7 +207,7 @@ public AnselToUnicode(InputStream in) { } /** - * Loads the entire maping (including multibyte characters) from the Library + * Loads the entire mapping (including multibyte characters) from the Library * of Congress. */ private void loadMultibyte() { @@ -153,58 +216,146 @@ private void loadMultibyte() { } private void checkMode(char[] data, CodeTracker cdt) { - while (cdt.offset < data.length && isEscape(data[cdt.offset])) { - switch (data[cdt.offset + 1]) { - case 0x28: - case 0x2c: - cdt.g0 = data[cdt.offset + 2]; - cdt.offset += 3; - cdt.multibyte = false; + int extra = 0; + int extra2 = 0; + int extra3 = 0; + while (cdt.offset + extra + extra2< data.length && isEscape(data[cdt.offset])) { + switch (data[cdt.offset + 1 + extra]) { + case 0x28: // '(' + case 0x2c: // ',' + set_cdt(cdt, 0, data, 2 + extra, false); break; - case 0x29: - case 0x2d: - cdt.g1 = data[cdt.offset + 2]; - cdt.offset += 3; - cdt.multibyte = false; + case 0x29: // ')' + case 0x2d: // '-' + set_cdt(cdt, 1, data, 2 + extra, false); break; - case 0x24: - cdt.multibyte = true; + case 0x24: // '$' if (!loadedMultibyte) { loadMultibyte(); loadedMultibyte = true; } - switch (data[cdt.offset + 1]) { - case 0x29: - case 0x2d: - cdt.g1 = data[cdt.offset + 3]; - cdt.offset += 4; + switch (data[cdt.offset + 2 + extra + extra2]) { + case 0x29: // ')' + case 0x2d: // '-' + set_cdt(cdt, 1, data, 3 + extra + extra2, true); + break; + case 0x2c: // ',' + set_cdt(cdt, 0, data, 3 + extra + extra2, true); break; - case 0x2c: - cdt.g0 = data[cdt.offset + 3]; - cdt.offset += 4; + case 0x31: // '1' + cdt.g0 = data[cdt.offset + 2 + extra + extra2]; + cdt.offset += 3 + extra + extra2; + cdt.multibyte = true; break; - default: - cdt.g0 = data[cdt.offset + 2]; - cdt.offset += 3; + case 0x20: // ' ' + // space found in escape code: look ahead and try to proceed + extra2++; + break; + default: + // unknown code character found: discard escape sequence and return + cdt.offset += 1; + if (errorList 
!= null) + { + errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character."); + } + else + { + throw new MarcException("Unknown character set code found following escape character."); + } break; } break; - case 0x67: - case 0x62: - case 0x70: - cdt.g0 = data[cdt.offset + 1]; - cdt.offset += 2; + case 0x67: // 'g' + case 0x62: // 'b' + case 0x70: // 'p' + cdt.g0 = data[cdt.offset + 1 + extra]; + cdt.offset += 2 + extra; cdt.multibyte = false; break; - case 0x73: + case 0x73: // 's' cdt.g0 = 0x42; - cdt.offset += 2; + cdt.offset += 2 + extra; cdt.multibyte = false; break; + case 0x20: // ' ' + // space found in escape code: look ahead and try to proceed + if (errorList == null) + { + throw new MarcException("Extraneous space character found within MARC8 character set escape sequence"); + } + extra++; + break; + default: + // unknown code character found: discard escape sequence and return + cdt.offset += 1; + if (errorList != null) + { + errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character."); + } + else + { + throw new MarcException("Unknown character set code found following escape character."); + } + break; } } + if (errorList != null && ( extra != 0 || extra2 != 0)) + { + errorList.addError(ErrorHandler.MINOR_ERROR, "" + (extra+extra2) + " extraneous space characters found within MARC8 character set escape sequence"); + } } + private void set_cdt(CodeTracker cdt, int g0_or_g1, char[] data, int addnlOffset, boolean multibyte) + { + if (data[cdt.offset + addnlOffset] == '!' && data[cdt.offset + addnlOffset + 1] == 'E') + { + addnlOffset++; + } + else if (data[cdt.offset + addnlOffset] == ' ') + { + if (errorList != null) + { + errorList.addError(ErrorHandler.MINOR_ERROR, "Extraneous space character found within MARC8 character set escape sequence. 
Skipping over space."); + } + else + { + throw new MarcException("Extraneous space character found within MARC8 character set escape sequence"); + } + addnlOffset++; + } + else if ("(,)-$!".indexOf(data[cdt.offset + addnlOffset]) != -1) + { + if (errorList != null) + { + errorList.addError(ErrorHandler.MINOR_ERROR, "Extraneaous intermediate character found following escape character. Discarding intermediate character."); + } + else + { + throw new MarcException("Extraneaous intermediate character found following escape character."); + } + addnlOffset++; + } + if ("34BE1NQS2".indexOf(data[cdt.offset + addnlOffset]) == -1) + { + cdt.offset += 1; + cdt.multibyte = false; + if (errorList != null) + { + errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character."); + } + else + { + throw new MarcException("Unknown character set code found following escape character."); + } + } + else // All is well, proceed normally + { + if (g0_or_g1 == 0) cdt.g0 = data[cdt.offset + addnlOffset]; + else cdt.g1 = data[cdt.offset + addnlOffset]; + cdt.offset += 1 + addnlOffset; + cdt.multibyte = multibyte; + } + } /** *

* Converts MARC-8 data to UCS/Unicode. @@ -214,9 +365,8 @@ private void checkMode(char[] data, CodeTracker cdt) { * the MARC-8 data * @return String - the UCS/Unicode data */ - public String convert(String dataElement) { - char[] data = null; - data = dataElement.toCharArray(); + public String convert(char data[]) + { StringBuffer sb = new StringBuffer(); int len = data.length; @@ -232,14 +382,17 @@ public String convert(String dataElement) { Queue diacritics = new Queue(); - while (cdt.offset < data.length) { + while (cdt.offset < data.length) + { if (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1) - && hasNext(cdt.offset, len)) { + && hasNext(cdt.offset, len)) + { while (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1) - && hasNext(cdt.offset, len)) { - diacritics.put(new Character(getChar(data[cdt.offset], - cdt.g0, cdt.g1))); + && hasNext(cdt.offset, len)) + { + char c = getChar(data[cdt.offset], cdt.g0, cdt.g1); + if (c != 0) diacritics.put(new Character(c)); cdt.offset++; checkMode(data, cdt); } @@ -247,23 +400,122 @@ && hasNext(cdt.offset, len)) { char c2 = getChar(data[cdt.offset], cdt.g0, cdt.g1); cdt.offset++; checkMode(data, cdt); - sb.append(c2); + if (c2 != 0) sb.append(c2); - while (!diacritics.isEmpty()) { + while (!diacritics.isEmpty()) + { char c1 = ((Character) diacritics.get()).charValue(); sb.append(c1); } - } else if (cdt.multibyte) { - sb.append(ct.getChar(makeMultibyte(new String(data).substring( - cdt.offset, cdt.offset + 4).toCharArray()), cdt.g0)); - cdt.offset += 3; - } else { - sb.append(getChar(data[cdt.offset], cdt.g0, cdt.g1)); + } + else if (cdt.multibyte) + { + if (data[cdt.offset]== 0x20) + { + // if a 0x20 byte occurs amidst a sequence of multibyte characters + // skip over it and output a space. + // Hmmm. If the following line is present it seems to output two spaces + // when a space occurs in multibytes chars, without it one seems to be output. 
+ // sb.append(getChar(data[cdt.offset], cdt.g0, cdt.g1)); + cdt.offset += 1; + } + else if (cdt.offset + 3 <= data.length && (errorList == null || data[cdt.offset+1]!= 0x20 && data[cdt.offset+2]!= 0x20)) + { + char c = getMBChar(makeMultibyte(data[cdt.offset], data[cdt.offset+1], data[cdt.offset+2])); + if (errorList == null || c != 0) + { + sb.append(c); + cdt.offset += 3; + } + else if (cdt.offset + 6 <= data.length && data[cdt.offset+4]!= 0x20 && data[cdt.offset+5]!= 0x20 && + getMBChar(makeMultibyte(data[cdt.offset+3], data[cdt.offset+4], data[cdt.offset+5])) != 0) + { + if (errorList != null) + { + errorList.addError(ErrorHandler.MAJOR_ERROR, "Erroneous MARC8 multibyte character, Discarding bad character and continuing reading Multibyte characters"); + sb.append("[?]"); + cdt.offset += 3; + } + } + else if (cdt.offset + 4 <= data.length && data[cdt.offset] > 0x7f && + getMBChar(makeMultibyte(data[cdt.offset+1], data[cdt.offset+2], data[cdt.offset+3])) != 0) + { + if (errorList != null) + { + errorList.addError(ErrorHandler.MAJOR_ERROR, "Erroneous character in MARC8 multibyte character, Copying bad character and continuing reading Multibyte characters"); + sb.append(getChar(data[cdt.offset], 0x42, 0x45)); + cdt.offset += 1; + } + } + else + { + if (errorList != null) + { + errorList.addError(ErrorHandler.MAJOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set"); + } + cdt.multibyte = false; + cdt.g0 = 0x42; + cdt.g1 = 0x45; + } + } + else if (errorList != null && cdt.offset + 4 <= data.length && ( data[cdt.offset+1] == 0x20 || data[cdt.offset+2]== 0x20)) + { + int multiByte = makeMultibyte( data[cdt.offset], ((data[cdt.offset+1] != 0x20)? 
data[cdt.offset+1] : data[cdt.offset+2]), data[cdt.offset+3]); + char c = getMBChar(multiByte); + if (c != 0) + { + if (errorList != null) + { + errorList.addError(ErrorHandler.ERROR_TYPO, "Extraneous space found within MARC8 multibyte character"); + } + sb.append(c); + sb.append(' '); + cdt.offset += 4; + } + else + { + if (errorList != null) + { + errorList.addError(ErrorHandler.MAJOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set"); + } + cdt.multibyte = false; + cdt.g0 = 0x42; + cdt.g1 = 0x45; + } + } + else if (cdt.offset + 3 > data.length) + { + if (errorList != null) + { + errorList.addError(ErrorHandler.MAJOR_ERROR, "Partial MARC8 multibyte character, inserting change to default character set"); + cdt.multibyte = false; + cdt.g0 = 0x42; + cdt.g1 = 0x45; + } + // if a field ends with an incomplete encoding of a multibyte character + // simply discard that final partial character. + else + { + cdt.offset += 3; + } + } + } + else + { + char c = getChar(data[cdt.offset], cdt.g0, cdt.g1); + if (c != 0) sb.append(c); + else + { + String val = "0000"+Integer.toHexString((int)(data[cdt.offset])); + sb.append("" ); + } cdt.offset += 1; } if (hasNext(cdt.offset, len)) + { checkMode(data, cdt); + } } return sb.toString(); } @@ -275,6 +527,15 @@ private int makeMultibyte(char[] data) { chars[2] = data[2]; return chars[0] | chars[1] | chars[2]; } + + public int makeMultibyte(char c1, char c2, char c3) + { + int[] chars = new int[3]; + chars[0] = c1 << 16; + chars[1] = c2 << 8; + chars[2] = c3; + return chars[0] | chars[1] | chars[2]; + } private char getChar(int ch, int g0, int g1) { if (ch <= 0x7E) @@ -283,7 +544,7 @@ private char getChar(int ch, int g0, int g1) { return ct.getChar(ch, g1); } - private char getMBChar(int ch) { + public char getMBChar(int ch) { return ct.getChar(ch, 0x31); } diff --git a/src/org/marc4j/converter/impl/CodeTable.java b/src/org/marc4j/converter/impl/CodeTable.java index 586efd30..ed125ec4 100644 --- 
a/src/org/marc4j/converter/impl/CodeTable.java +++ b/src/org/marc4j/converter/impl/CodeTable.java @@ -1,4 +1,4 @@ -// $Id: CodeTable.java,v 1.2 2005/12/14 17:11:30 bpeters Exp $ +// $Id: CodeTable.java,v 1.3 2008/09/26 21:17:42 haschart Exp $ /** * Copyright (C) 2002 Bas Peters * @@ -23,8 +23,11 @@ import java.io.File; import java.io.FileInputStream; import java.io.InputStream; +import java.io.PrintStream; import java.net.URI; -import java.util.Hashtable; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; import java.util.Vector; import javax.xml.parsers.SAXParser; @@ -41,21 +44,21 @@ *

* * @author Corey Keith - * @version $Revision: 1.2 $ + * @version $Revision: 1.3 $ * */ -public class CodeTable { - protected static Hashtable charsets = null; +public class CodeTable implements CodeTableInterface { + protected static HashMap charsets = null; - protected static Hashtable combining = null; + protected static HashMap combining = null; public boolean isCombining(int i, int g0, int g1) { if (i <= 0x7E) { Vector v = (Vector) combining.get(new Integer(g0)); - return v.contains(new Integer(i)); + return (v != null && v.contains(new Integer(i))); } else { Vector v = (Vector) combining.get(new Integer(g1)); - return v.contains(new Integer(i)); + return (v != null && v.contains(new Integer(i))); } } @@ -63,7 +66,7 @@ public char getChar(int c, int mode) { if (c == 0x20) return (char) c; else { - Hashtable charset = (Hashtable) charsets.get(new Integer(mode)); + HashMap charset = (HashMap) charsets.get(new Integer(mode)); if (charset == null) { // System.err.println("Hashtable not found: " @@ -72,18 +75,13 @@ public char getChar(int c, int mode) { } else { Character ch = (Character) charset.get(new Integer(c)); if (ch == null) { - - int newc; - if (c < 0x80) - newc = c + 0x80; - else - newc = c - 0x80; + int newc = (c < 0x80) ? c + 0x80 : c - 0x80; ch = (Character) charset.get(new Integer(newc)); if (ch == null) { // System.err.println("Character not found: " // + Integer.toHexString(c) + " in Code Table: " // + Integer.toHexString(mode)); - return (char) c; + return (char) 0; } else return ch.charValue(); } else @@ -92,6 +90,144 @@ public char getChar(int c, int mode) { } } + public void dumpTableAsSwitchStatement(PrintStream output) + { + output.println("package org.marc4j.converter.impl;"); + output.println(""); + output.println("// Warning: This file is generated by running the main routine in the file CodeTable.java "); + output.println("// Warning: Do not edit this file, or all edits will be lost at the next build. 
"); + output.println("public class CodeTableGenerated implements CodeTableInterface {"); + output.println("\tpublic boolean isCombining(int i, int g0, int g1) {"); + output.println("\t\tswitch (i <= 0x7E ? g0 : g1) {"); + Object combiningKeys[] = combining.keySet().toArray(); + Arrays.sort(combiningKeys); + for (int combiningSel = 0; combiningSel < combiningKeys.length; combiningSel++) + { + Integer nextKey = (Integer)combiningKeys[combiningSel]; + output.println("\t\t\tcase 0x"+Integer.toHexString(nextKey)+":"); + Vector v = (Vector) combining.get(nextKey); + Iterator vIter = v.iterator(); + if (vIter.hasNext()) + { + output.println("\t\t\t\tswitch(i) {"); + while (vIter.hasNext()) + { + Integer vVal = (Integer)vIter.next(); + output.println("\t\t\t\t\tcase 0x"+Integer.toHexString(vVal)+":"); + } + output.println("\t\t\t\t\t\treturn(true);"); + output.println("\t\t\t\t\tdefault:"); + output.println("\t\t\t\t\t\treturn(false);"); + output.println("\t\t\t\t}"); + } + else + { + output.println("\t\t\t\treturn(false);"); + } + } + output.println("\t\t\tdefault:"); + output.println("\t\t\t\treturn(false);"); + output.println("\t\t\t}"); + output.println("\t}"); + output.println(""); + output.println("\tpublic char getChar(int c, int mode) {"); + output.println("\t\tint code = getCharCode(c, mode);"); + output.println("\t\tif (code == -1) return((char)0);"); + output.println("\t\tif (code != 0) return((char)code);"); + output.println("\t\tcode = getCharCode(c < 0x80 ? 
c + 0x80 : c - 0x80 , mode);"); + output.println("\t\treturn((char)code);"); + output.println("\t}"); + output.println(""); + output.println("\tprivate int getCharCode(int c, int mode) {"); + output.println("\t\tif (c == 0x20) return c;"); + output.println("\t\tswitch (mode) {"); + Object charsetsKeys[] = charsets.keySet().toArray(); + Arrays.sort(charsetsKeys); + for (int charsetSel = 0; charsetSel < charsetsKeys.length; charsetSel++) + { + Integer nextKey = (Integer)charsetsKeys[charsetSel]; + output.println("\t\t\tcase 0x"+Integer.toHexString(nextKey)+":"); + if (nextKey.intValue() == 0x31) + { + output.println("\t\t\t\treturn(getMultiByteChar(c));"); + } + else + { + HashMap map = (HashMap) charsets.get(nextKey); + Object keyArray[] = map.keySet().toArray(); + Arrays.sort(keyArray); + output.println("\t\t\t\tswitch(c) {"); + for (int sel = 0; sel < keyArray.length; sel++) + { + Integer mKey = (Integer)keyArray[sel]; + Character c = (Character)map.get(mKey); + if (c != null) + output.println("\t\t\t\t\tcase 0x"+Integer.toHexString(mKey)+": return(0x"+Integer.toHexString((int)c.charValue())+"); "); + else + output.println("\t\t\t\t\tcase 0x"+Integer.toHexString(mKey)+": return(0); "); + } + output.println("\t\t\t\t\tdefault: return(0);"); + output.println("\t\t\t\t}"); + } + } + output.println("\t\t\tdefault: return(-1); // unknown charset specified "); + output.println("\t\t}"); + output.println("\t}"); + output.println(""); + StringBuffer getMultiByteFunc = new StringBuffer(); + getMultiByteFunc.append("\tpublic int getMultiByteChar(int c) {\n"); + + HashMap map = (HashMap) charsets.get(new Integer(0x31)); + Object keyArray[] = map.keySet().toArray(); + Arrays.sort(keyArray); + + dumpPartialMultiByteTable(output, getMultiByteFunc, keyArray, map, 0x210000, 0x214fff); + dumpPartialMultiByteTable(output, getMultiByteFunc, keyArray, map, 0x215000, 0x21ffff); +// dumpPartialMultiByteTable(output, getMultiByteFunc, keyArray, map, 0x210000, 0x21ffff); + 
dumpPartialMultiByteTable(output, getMultiByteFunc, keyArray, map, 0x220000, 0x22ffff); + dumpPartialMultiByteTable(output, getMultiByteFunc, keyArray, map, 0x230000, 0x27ffff); + dumpPartialMultiByteTable(output, getMultiByteFunc, keyArray, map, 0x280000, 0x7f7fff); + + getMultiByteFunc.append("\t\treturn(0);\n"); + getMultiByteFunc.append("\t}"); + output.println(getMultiByteFunc.toString()); + + output.println("}"); + + } + + public void dumpPartialMultiByteTable(PrintStream output, StringBuffer buffer, Object keyArray[], HashMap map, int startByte, int endByte) + { + String startByteStr = "0x"+Integer.toHexString(startByte); + String endByteStr = "0x"+Integer.toHexString(endByte); + buffer.append("\t\tif (c >= "+startByteStr+" && c <= "+endByteStr+") return (getMultiByteChar_"+startByteStr+"_"+endByteStr+"(c));\n"); + + output.println("\tpublic char getMultiByteChar_"+startByteStr+"_"+endByteStr+"(int c) {"); + output.println("\t\tswitch(c) {"); + for (int sel = 0; sel < keyArray.length; sel++) + { + Integer mKey = (Integer)keyArray[sel]; + Character c = (Character)map.get(mKey); + if (mKey >= startByte && mKey <= endByte) + { + if (c != null) + output.println("\t\t\tcase 0x"+Integer.toHexString(mKey)+": return((char)0x"+Integer.toHexString((int)c.charValue())+"); "); + else + output.println("\t\t\tcase 0x"+Integer.toHexString(mKey)+": return((char)0); "); + } + } + output.println("\t\t\tdefault: return((char)0);"); + output.println("\t\t}"); + output.println("\t}"); + output.println(""); + } + + public static void main(String args[]) + { + CodeTable ct = new CodeTable(CodeTable.class.getResourceAsStream("resources/codetables.xml")); + ct.dumpTableAsSwitchStatement(System.out); + } + public CodeTable(InputStream byteStream) { try { diff --git a/src/org/marc4j/converter/impl/CodeTableHandler.java b/src/org/marc4j/converter/impl/CodeTableHandler.java index 3ea843dc..9a9a4321 100644 --- a/src/org/marc4j/converter/impl/CodeTableHandler.java +++ 
b/src/org/marc4j/converter/impl/CodeTableHandler.java @@ -1,4 +1,4 @@ -// $Id: CodeTableHandler.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $ +// $Id: CodeTableHandler.java,v 1.2 2008/09/26 21:17:42 haschart Exp $ /** * Copyright (C) 2002 Bas Peters * @@ -22,7 +22,7 @@ import java.io.File; import java.io.FileInputStream; -import java.util.Hashtable; +import java.util.HashMap; import java.util.Vector; import javax.xml.parsers.SAXParser; @@ -41,17 +41,17 @@ * builds a data structure to facilitate AnselToUnicode character conversion. * * @author Corey Keith - * @version $Revision: 1.1 $ + * @version $Revision: 1.2 $ * * @see DefaultHandler */ public class CodeTableHandler extends DefaultHandler { - private Hashtable sets; + private HashMap sets; - private Hashtable charset; + private HashMap charset; - private Hashtable combiningchars; + private HashMap combiningchars; /** Data element identifier */ private Integer isocode; @@ -75,11 +75,11 @@ public class CodeTableHandler extends DefaultHandler { /** Locator object */ private Locator locator; - public Hashtable getCharSets() { + public HashMap getCharSets() { return sets; } - public Hashtable getCombiningChars() { + public HashMap getCombiningChars() { return combiningchars; } @@ -98,14 +98,14 @@ public void setDocumentLocator(Locator locator) { public void startElement(String uri, String name, String qName, Attributes atts) throws SAXParseException { if (name.equals("characterSet")) { - charset = new Hashtable(); + charset = new HashMap(); isocode = Integer.valueOf(atts.getValue("ISOcode"), 16); combining = new Vector(); } else if (name.equals("marc")) data = new StringBuffer(); else if (name.equals("codeTables")) { - sets = new Hashtable(); - combiningchars = new Hashtable(); + sets = new HashMap(); + combiningchars = new HashMap(); } else if (name.equals("ucs")) data = new StringBuffer(); else if (name.equals("alt")) @@ -135,7 +135,7 @@ public void endElement(String uri, String name, String qName) if (data.length() 
> 0) ucs = new Character((char) Integer.parseInt(data.toString(), 16)); else - useAlt = true; + ucs = null; } else if (name.equals("alt")) { if (useAlt && data.length() > 0) { ucs = new Character((char) Integer.parseInt(data.toString(), 16)); @@ -155,7 +155,7 @@ public void endElement(String uri, String name, String qName) } public static void main(String[] args) { - Hashtable charsets = null; + HashMap charsets = null; try { diff --git a/src/org/marc4j/converter/impl/CodeTableInterface.java b/src/org/marc4j/converter/impl/CodeTableInterface.java new file mode 100644 index 00000000..b8b31bed --- /dev/null +++ b/src/org/marc4j/converter/impl/CodeTableInterface.java @@ -0,0 +1,7 @@ +package org.marc4j.converter.impl; + +public interface CodeTableInterface +{ + public boolean isCombining(int i, int g0, int g1); + public char getChar(int c, int mode); +}; diff --git a/src/org/marc4j/converter/impl/Iso5426ToUnicode.java b/src/org/marc4j/converter/impl/Iso5426ToUnicode.java index 371735a3..91974521 100644 --- a/src/org/marc4j/converter/impl/Iso5426ToUnicode.java +++ b/src/org/marc4j/converter/impl/Iso5426ToUnicode.java @@ -1,4 +1,4 @@ -// $Id: Iso5426ToUnicode.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $ +// $Id: Iso5426ToUnicode.java,v 1.2 2008/09/26 21:17:42 haschart Exp $ /** * Copyright (C) 2002 Bas Peters (mail@bpeters.com) * Copyright (C) 2002 Yves Pratter (ypratter@club-internet.fr) @@ -30,9 +30,9 @@ * * @author Bas Peters * @author Yves Pratter - * @version $Revision: 1.1 $ + * @version $Revision: 1.2 $ */ -public class Iso5426ToUnicode implements CharConverter { +public class Iso5426ToUnicode extends CharConverter { /** *

@@ -43,8 +43,7 @@ public class Iso5426ToUnicode implements CharConverter { * the UNIMARC data * @return {@link String}- the UCS/Unicode data */ - public String convert(String dataElement) { - char[] data = dataElement.toCharArray(); + public String convert(char data[]) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < data.length; i++) { diff --git a/src/org/marc4j/converter/impl/Iso6937ToUnicode.java b/src/org/marc4j/converter/impl/Iso6937ToUnicode.java index 35e938b0..21c24ccd 100644 --- a/src/org/marc4j/converter/impl/Iso6937ToUnicode.java +++ b/src/org/marc4j/converter/impl/Iso6937ToUnicode.java @@ -1,4 +1,4 @@ -// $Id: Iso6937ToUnicode.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $ +// $Id: Iso6937ToUnicode.java,v 1.2 2008/09/26 21:17:42 haschart Exp $ /** * Copyright (C) 2002 Bas Peters (mail@bpeters.com) * Copyright (C) 2002 Yves Pratter (ypratter@club-internet.fr) @@ -30,9 +30,9 @@ * * @author Bas Peters * @author Yves Pratter - * @version $Revision: 1.1 $ + * @version $Revision: 1.2 $ */ -public class Iso6937ToUnicode implements CharConverter { +public class Iso6937ToUnicode extends CharConverter { /** *

@@ -43,8 +43,7 @@ public class Iso6937ToUnicode implements CharConverter { * the ISO 6937 data * @return {@link String}- the UCS/Unicode data */ - public String convert(String dataElement) { - char[] data = dataElement.toCharArray(); + public String convert(char data[]) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < data.length; i++) { diff --git a/src/org/marc4j/converter/impl/UnicodeToAnsel.java b/src/org/marc4j/converter/impl/UnicodeToAnsel.java index 5831bde4..774ca8ad 100644 --- a/src/org/marc4j/converter/impl/UnicodeToAnsel.java +++ b/src/org/marc4j/converter/impl/UnicodeToAnsel.java @@ -1,4 +1,4 @@ -// $Id: UnicodeToAnsel.java,v 1.2 2005/11/28 16:50:22 bpeters Exp $ +// $Id: UnicodeToAnsel.java,v 1.3 2008/09/26 21:17:42 haschart Exp $ /** * Copyright (C) 2002 Bas Peters (mail@bpeters.com) * @@ -36,9 +36,9 @@ * * @author Bas Peters * @author Corey Keith - * @version $Revision: 1.2 $ + * @version $Revision: 1.3 $ */ -public class UnicodeToAnsel implements CharConverter { +public class UnicodeToAnsel extends CharConverter { protected ReverseCodeTable rct; static final char ESC = 0x1b; @@ -97,8 +97,7 @@ public UnicodeToAnsel(InputStream in) { * the UCS/Unicode data * @return String - the MARC-8 data */ - public String convert(String dataElement) { - char[] data = dataElement.toCharArray(); + public String convert(char data[]) { StringBuffer sb = new StringBuffer(); CodeTableTracker ctt = new CodeTableTracker(); diff --git a/src/org/marc4j/converter/impl/UnicodeToIso5426.java b/src/org/marc4j/converter/impl/UnicodeToIso5426.java index e2bfa958..ee139579 100644 --- a/src/org/marc4j/converter/impl/UnicodeToIso5426.java +++ b/src/org/marc4j/converter/impl/UnicodeToIso5426.java @@ -1,4 +1,4 @@ -// $Id: UnicodeToIso5426.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $ +// $Id: UnicodeToIso5426.java,v 1.2 2008/09/26 21:17:42 haschart Exp $ /** * Copyright (C) 2002 Bas Peters (mail@bpeters.com) * @@ -29,9 +29,9 @@ * * @author Bas Peters * @author Yves Pratter - 
* @version $Revision: 1.1 $ + * @version $Revision: 1.2 $ */ -public class UnicodeToIso5426 implements CharConverter { +public class UnicodeToIso5426 extends CharConverter { /** *

@@ -46,8 +46,7 @@ public class UnicodeToIso5426 implements CharConverter { * the UCS/Unicode data * @return {@link String}- the UNIMARC (ISO 5426 charset) data */ - public String convert(String dataElement) { - char[] data = dataElement.toCharArray(); + public String convert(char data[]) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < data.length; i++) { char c = data[i]; diff --git a/src/org/marc4j/converter/impl/UnicodeToIso6937.java b/src/org/marc4j/converter/impl/UnicodeToIso6937.java index a6f66eff..d6c5d8f0 100644 --- a/src/org/marc4j/converter/impl/UnicodeToIso6937.java +++ b/src/org/marc4j/converter/impl/UnicodeToIso6937.java @@ -1,4 +1,4 @@ -// $Id: UnicodeToIso6937.java,v 1.1 2005/05/04 10:06:46 bpeters Exp $ +// $Id: UnicodeToIso6937.java,v 1.2 2008/09/26 21:17:42 haschart Exp $ /** * Copyright (C) 2002 Bas Peters (mail@bpeters.com) * @@ -29,9 +29,9 @@ * * @author Bas Peters * @author Yves Pratter - * @version $Revision: 1.1 $ + * @version $Revision: 1.2 $ */ -public class UnicodeToIso6937 implements CharConverter { +public class UnicodeToIso6937 extends CharConverter { /** *

@@ -46,8 +46,7 @@ public class UnicodeToIso6937 implements CharConverter { * the UCS/Unicode data * @return {@link String}- the ISO 6937 data */ - public String convert(String dataElement) { - char[] data = dataElement.toCharArray(); + public String convert(char data[]) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < data.length; i++) { char c = data[i]; diff --git a/src/org/marc4j/marc/impl/RecordImpl.java b/src/org/marc4j/marc/impl/RecordImpl.java index d6f59cc8..d5a32c5b 100644 --- a/src/org/marc4j/marc/impl/RecordImpl.java +++ b/src/org/marc4j/marc/impl/RecordImpl.java @@ -1,4 +1,4 @@ -// $Id: RecordImpl.java,v 1.4 2006/08/04 12:29:01 bpeters Exp $ +// $Id: RecordImpl.java,v 1.5 2008/09/26 21:17:43 haschart Exp $ /** * Copyright (C) 2004 Bas Peters * @@ -35,7 +35,7 @@ * Represents a MARC record. * * @author Bas Peters - * @version $Revision: 1.4 $ + * @version $Revision: 1.5 $ */ public class RecordImpl implements Record { @@ -179,7 +179,9 @@ public List getVariableFields() { } public String getControlNumber() { - return new String(getControlNumberField().getData()); + ControlField f = getControlNumberField(); + String result = (f == null || f.getData() == null) ? null : new String(f.getData()); + return(result); } public List getVariableFields(String[] tags) { diff --git a/src/org/marc4j/marc/impl/Verifier.java b/src/org/marc4j/marc/impl/Verifier.java index d9b6970d..2f3a7bbf 100644 --- a/src/org/marc4j/marc/impl/Verifier.java +++ b/src/org/marc4j/marc/impl/Verifier.java @@ -1,4 +1,4 @@ -// $Id: Verifier.java,v 1.1 2005/05/04 10:06:47 bpeters Exp $ +// $Id: Verifier.java,v 1.2 2008/09/26 21:17:42 haschart Exp $ /** * Copyright (C) 2004 Bas Peters * @@ -29,7 +29,7 @@ * Handles MARC checks on tags, data elements and Record objects. 
* * @author Bas Peters - * @version $Revision: 1.1 $ + * @version $Revision: 1.2 $ */ public class Verifier { @@ -40,8 +40,8 @@ private Verifier() { * Returns true if the given String value identifies a tag for * a control field (001 through 009). */ - public static boolean isControlField(String tag) throws NumberFormatException { - if (Integer.parseInt(tag) < 10) + public static boolean isControlField(String tag) { + if (tag.length() == 3 && tag.charAt(0) == '0' && tag.charAt(1) == '0' && tag.charAt(2) >= '0' && tag.charAt(2) <= '9')// if (Integer.parseInt(tag) < 10) return true; return false; } @@ -50,14 +50,12 @@ public static boolean isControlField(String tag) throws NumberFormatException { * Returns true if the given String value identifies a tag for * a control number field (001). */ - public static boolean isControlNumberField(String tag) - throws NumberFormatException { - if (Integer.parseInt(tag) == 1) + public static boolean isControlNumberField(String tag){ + if (tag.equals("001")) return true; return false; } - - /** +/** * Returns true if the given Collection contains an instance of * a ControlField with a control number field tag (001). * diff --git a/src/org/marc4j/samples/HandleExceptionExample.java b/src/org/marc4j/samples/HandleExceptionExample.java new file mode 100644 index 00000000..4bcae7aa --- /dev/null +++ b/src/org/marc4j/samples/HandleExceptionExample.java @@ -0,0 +1,57 @@ +// $Id: HandleExceptionExample.java,v 1.1 2008/09/26 21:17:42 haschart Exp $ +/** + * Copyright (C) 2002-2006 Bas Peters + * + * This file is part of MARC4J + * + * MARC4J is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * MARC4J is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with MARC4J; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.marc4j.samples; + +import java.io.InputStream; + +import org.marc4j.MarcException; +import org.marc4j.MarcReader; +import org.marc4j.MarcStreamReader; +import org.marc4j.marc.Record; + +/** + * Reads MARC input. + * + * @author Bas Peters + * @version $Revision: 1.1 $ + */ +public class HandleExceptionExample { + + public static void main(String args[]) throws Exception { + + InputStream input = HandleExceptionExample.class + .getResourceAsStream("resources/error.mrc"); + + try { + MarcReader reader = new MarcStreamReader(input); + while (reader.hasNext()) { + Record record = reader.next(); + System.out.println(record.toString()); + } + } catch (MarcException e) { + System.out.println("something went wrong man!"); + + } + + System.out.println("damn!"); + } + +} \ No newline at end of file diff --git a/src/org/marc4j/samples/PermissiveReaderExample.java b/src/org/marc4j/samples/PermissiveReaderExample.java new file mode 100644 index 00000000..de167ba0 --- /dev/null +++ b/src/org/marc4j/samples/PermissiveReaderExample.java @@ -0,0 +1,229 @@ +package org.marc4j.samples; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintStream; +import java.io.UnsupportedEncodingException; +import java.util.Iterator; +import java.util.List; + +import org.marc4j.ErrorHandler; +import org.marc4j.MarcException; +import 
org.marc4j.MarcPermissiveStreamReader; +import org.marc4j.MarcReader; +import org.marc4j.MarcStreamWriter; +import org.marc4j.MarcWriter; +import org.marc4j.marc.Record; + +public class PermissiveReaderExample +{ + + /** + * This test program demonstrates the use of the MarcPermissiveStreamReader + * to read Marc records, with the permissive setting turned on. It also + * demonstrates the capability of printing out the error messages that are + * generated when the MarcPermissiveStreamReader encounters records with + * structural error or encoding errors. + * + * When run in verbose mode, (by passing -v as the first parameter) the + * program will display the entire record highlighting the lines in the + * record that have errors that the permissive reader was able to detect + * and make an attempt at correcting. Following that the program will + * list all of the errors that it found in the record. + * + * When run in verbose mode as described above, the program is useful for + * validating records. + * + * Shown below is the output generated when the program is run on the file + * error.mrc found in the resources sub-directory in the samples directory: + * + * Fatal Exception: error parsing data field for tag: 250 with data: a1st ed. + * Typo : Record terminator character not found at end of record length --- [ n/a : n/a ] + * Typo : Record terminator appears after stated record length, reading extra bytes --- [ n/a : n/a ] + * Minor Error : Field length found in record different from length stated in the directory. --- [ n/a : n/a ] + * LEADER 00715cam a2200205 a 4500 + * 001 12883376 + * 005 20030616111422.0 + * 008 020805s2002 nyu j 000 1 eng + * 020 $a0786808772 + * 020 $a0786816155 (pbk.) + * 040 $aDLC$cDLC$dDLC + * 100 1 $aChabon, Michael. + * 245 10$aSummerland /$cMichael Chabon. + * 250 $a1st ed. + * 260 $aNew York :$bMiramax Books/Hyperion Books for Children,$cc2002. + * 300 $a500 p. ;$c22 cm. 
+ * 520 $aEthan Feld, the worst baseball player in the history of the game, finds himself recruited by a 100-year-old scout to help a band of fairies triumph over an ancient enemy. + * 650 1$aFantasy. + * 650 1$aBaseball$vFiction. + * 650 1$aMagic$vFiction. + */ + public static void main(String[] args) + { + PrintStream out = System.out; + boolean verbose = Boolean.parseBoolean(System.getProperty("marc.verbose")); + boolean veryverbose = Boolean.parseBoolean(System.getProperty("marc.verbose")); + if (args[0].equals("-v")) + { + verbose = true; + String newArgs[] = new String[args.length-1]; + System.arraycopy(args, 1, newArgs, 0, args.length-1); + args = newArgs; + } + if (args[0].equals("-vv")) + { + verbose = true; + veryverbose = true; + String newArgs[] = new String[args.length-1]; + System.arraycopy(args, 1, newArgs, 0, args.length-1); + args = newArgs; + } + String fileStr = args[0]; + File file = new File(fileStr); + MarcReader readerNormal = null; + MarcReader readerPermissive = null; + boolean to_utf_8 = true; + + InputStream inNorm; + InputStream inPerm; + OutputStream patchedRecStream = null; + MarcWriter patchedRecs = null; + ErrorHandler errorHandler = new ErrorHandler(); + try + { + inNorm = new FileInputStream(file); + readerNormal = new MarcPermissiveStreamReader(inNorm, false, to_utf_8); + inPerm = new FileInputStream(file); + readerPermissive = new MarcPermissiveStreamReader(inPerm, errorHandler, to_utf_8, "BESTGUESS"); + } + catch (FileNotFoundException e) + { + // TODO Auto-generated catch block + e.printStackTrace(); + } + boolean done = false; + if (args.length > 1) + { + try + { + patchedRecStream = new FileOutputStream(new File(args[1])); + patchedRecs = new MarcStreamWriter(patchedRecStream); + } + catch (FileNotFoundException e) + { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + while (readerNormal.hasNext() && readerPermissive.hasNext()) + { + Record recNorm; + Record recPerm; + recPerm = readerPermissive.next(); + 
String strPerm = recPerm.toString(); + try { + recNorm = readerNormal.next(); + } + catch (MarcException me) + { + if (verbose) + { + out.println("Fatal Exception: "+ me.getMessage()); + dumpErrors(out, errorHandler); + showDiffs(out, null, strPerm); + out.println("-------------------------------------------------------------------------------------"); + } + continue; + } + String strNorm = recNorm.toString(); + if (!strNorm.equals(strPerm)) + { + if (verbose) + { + dumpErrors(out, errorHandler); + showDiffs(out, strNorm, strPerm); + out.println("-------------------------------------------------------------------------------------"); + + } + if (patchedRecs != null) + { + patchedRecs.write(recPerm); + } + } + else if (errorHandler.hasErrors()) + { + if (verbose) + { + out.println("Results identical, but errors reported"); + dumpErrors(out, errorHandler); + showDiffs(out, strNorm, strPerm); + out.println("-------------------------------------------------------------------------------------"); + } + if (patchedRecs != null) + { + patchedRecs.write(recPerm); + } + } + else if (veryverbose) + { + showDiffs(out, strNorm, strPerm); + } + + } + } + + public static void showDiffs(PrintStream out, String strNorm, String strPerm) + { + if (strNorm != null) + { + String normLines[] = strNorm.split("\n"); + String permLines[] = strPerm.split("\n"); + if (normLines.length == permLines.length) + { + for (int i = 0; i < normLines.length; i++) + { + if (normLines[i].equals(permLines[i])) + { + out.println(" " + normLines[i]); + } + else + { + out.println(" < " + normLines[i]); + out.println(" > " + permLines[i]); + } + } + } + } + else + { + String permLines[] = strPerm.split("\n"); + for (int i = 0; i < permLines.length; i++) + { + out.println(" " + permLines[i]); + } + } + + } + + public static void dumpErrors(PrintStream out, ErrorHandler errorHandler) + { + List errors = errorHandler.getErrors(); + if (errors != null) + { + Iterator iter = errors.iterator(); + while 
(iter.hasNext()) + { + Object error = iter.next(); + if (((ErrorHandler.Error)(error)).getSeverity() >= ErrorHandler.MINOR_ERROR) + { + int i = 10; + } + out.println(error.toString()); + } + } + } +} diff --git a/src/org/marc4j/samples/resources/diacritic4.mrc b/src/org/marc4j/samples/resources/diacritic4.mrc new file mode 100644 index 00000000..1972ded5 --- /dev/null +++ b/src/org/marc4j/samples/resources/diacritic4.mrc @@ -0,0 +1 @@ +03059cam 2200301 i 4500001001300000003000400013005001700017008004100034040001800075010001700093245005400110260002100164300002100185500019000206500019000396500015700586500018600743500019700929500023801126500022301364500020801587500016801795500020701963500020902170500016102379500014102540500007602681 77123332 DLC20051218154744.0981008b2001 ilu 000 0 eng  aDLCcDLCdDLC a 77123332 00aOCLC diacritic and special character test record. any :bny,c2001. a100 p. ;c12 cm. aVOYAGER COLUMN 0 (NEW): Degree sign (À); Phono Copyright mark (Â); Copyright mark (Ã); Sharp (Ä); Inverted Question mark (Å); Inverted Exclamation mark (Æ); Eszett (Ç); Euro (È). aVOYAGER COLUMN 1: Script L (Á); Polish L (¡); Scandanavian O (¢); D with Crossbar (£); Icelandic Thorn (¤); AE Digraph (¥); OE Digraph (¦); Miagkii Znak (§); Dot at Midline (¨). aVOYAGER COLUMN 2: Musical Flat (©); Patent Mark (ª); Plus or Minus («); O Hook (¬); U Hook (­); Alif (®); alpha (gas); Ayn (°); Polish l (±). aVOYAGER COLUMN 3: Scandanavian o (²); d with crossbar (³); Icelandic Thorn (´); ae Digraph (µ); oe Digraph (¶); Tverdii Znak (·); Turkish i (¸); British Pound (¹); eth (º). aVOYAGER COLUMN 4: Dagger (DO NOT USE); o Hook (¼); u Hook (½); Beta (gbs); Gamma (gcs); Superscript 0 (p0s); Superscript 1 (p1s); Superscript 2 (p2s); Superscript 3 (p3s). aVOYAGER COLUMN 5: Superscript 4 (p4s); Superscript 5 (p5s); Superscript 6 (p6s); Superscript 7 (p7s); Superscript 8 (p8s); Superscript 9 (p9s); Superscript + (p+s); Superscript - (p-s); Superscript ( (p(s). 
aVOYAGER COLUMN 6: Superscript ) (p)s); Subscript 0 (b0s); Subscript 1 (b1s); Subscript 2 (b2s); Subscript 3 (b3s); Subscript 4 (b4s); Subscript 5 (b5s); Subscript 6 (b6s); Subscript 7 (b7s). aVOYAGER COLUMN 7: Subscript 8 (b8s); Subscript 9 (b9s); Subscript + (b+s); Subscript - (b-s); Subscript ( (b(s); Subscript ) (b)s); Pseudo Question Mark (ào); Grave (áo); Acute (âo). aVOYAGER COLUMN 8: Circumflex (ão); Tilde (äo); Macron (åo); Breve (æo); Superior Dot (ço); Umlaut (èo); Hacek (éo); Circle Above (êo); Ligature left (ëo). aVOYAGER COLUMN 9: Ligature right (ìo) ; High Comma off center (ío); Double Acute (îo); Candrabindu (ïo); Cedilla (ðo); Right Hook (ño); Dot Below (òo); Double Dot Below (óo); Circle Below (ôo). aVOYAGER COLUMN 10: Double Underscore (õo); Underscore (öo); Left Hook (÷o); Right Cedilla (øo); Upadhmaniya (ùo); Double Tilde 1st half (úo); Double Tilde 2nd half (ûo) ; High Comma centered (þo). aVOYAGER PC Keyboard: Spacing Circumflex (^); Spacing Underscore (_); Spacing Grave (`); Open Curly Bracket ({); Close Curly Bracket (}); Spacing Tilde (~). aStandard PC Keyboard: 1234567890-= !@#$%^&*()_+ qwertyuiop[]\ QWERTYUIOP{}| asdfghjkl;' ASDFGHJKL:" zxcvbnm,./ ZXCVBNM<>? aDouble Tilde, 1st and 2nd halves (úoûo) ; Ligature, both halves (ëoìo). \ No newline at end of file diff --git a/src/org/marc4j/samples/resources/error.mrc b/src/org/marc4j/samples/resources/error.mrc new file mode 100644 index 00000000..cc8c4c94 --- /dev/null +++ b/src/org/marc4j/samples/resources/error.mrc @@ -0,0 +1 @@ +00714cam a2200205 a 45000010009000000050017000090080041000260200015000670200022000820400018001041000021001222450034001432500012001772600067001893000021002565200175002776500013004526500023004656500020004881288337620030616111422.0020805s2002 nyu j 000 1 eng  a0786808772 a0786816155 (pbk.) aDLCcDLCdDLC1 aChabon, Michael.10aSummerland /cMichael Chabon. a1st ed. aNew York :bMiramax Books/Hyperion Books for Children,cc2002. a500 p. ;c22 cm. 
aEthan Feld, the worst baseball player in the history of the game, finds himself recruited by a 100-year-old scout to help a band of fairies triumph over an ancient enemy. 1aFantasy. 1aBaseballvFiction. 1aMagicvFiction. \ No newline at end of file