From 36a131b48e96b845dc0698ddb39ea4cd1e85c71a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 12 May 2020 23:02:10 -0400 Subject: [PATCH 1/9] Used scalafix and scalafmt to clean up the code. --- .../scala/org/renci/umls/CodeMapper.scala | 89 ++++--- .../scala/org/renci/umls/db/DbConcepts.scala | 247 ++++++++++-------- .../scala/org/renci/umls/db/DbHierarchy.scala | 38 +-- .../scala/org/renci/umls/rrf/RRFCols.scala | 36 ++- .../org/renci/umls/rrf/RRFConcepts.scala | 87 +++--- .../scala/org/renci/umls/rrf/RRFDir.scala | 9 +- .../scala/org/renci/umls/rrf/RRFFile.scala | 3 +- .../scala/org/renci/umls/rrf/RRFFiles.scala | 8 +- .../org/renci/umls/rrf/RRFHierarchy.scala | 40 ++- 9 files changed, 301 insertions(+), 256 deletions(-) diff --git a/src/main/scala/org/renci/umls/CodeMapper.scala b/src/main/scala/org/renci/umls/CodeMapper.scala index ca0251f..88f8ff9 100644 --- a/src/main/scala/org/renci/umls/CodeMapper.scala +++ b/src/main/scala/org/renci/umls/CodeMapper.scala @@ -38,21 +38,14 @@ object CodeMapper extends App with LazyLogging { default = Some(new File("./sqlite.db")) ) - val fromSource: ScallopOption[String] = opt[String]( - descr = "The source to translate from" - ) + val fromSource: ScallopOption[String] = opt[String](descr = "The source to translate from") - val toSource: ScallopOption[String] = opt[String]( - descr = "The source to translate to" - ) + val toSource: ScallopOption[String] = opt[String](descr = "The source to translate to") - val idFile: ScallopOption[File] = opt[File]( - descr = "A file containing identifiers (in a single, newline-delimited column)" - ) + val idFile: ScallopOption[File] = + opt[File](descr = "A file containing identifiers (in a single, newline-delimited column)") - val outputFile: ScallopOption[File] = opt[File]( - descr = "Where to write the output file" - ) + val outputFile: ScallopOption[File] = opt[File](descr = "Where to write the output file") verify() } @@ -83,7 +76,9 @@ object CodeMapper extends App with 
LazyLogging { // Do we need to filter first? // Get ready to write output! - val stream = if (conf.outputFile.isEmpty) System.out else new PrintStream(new FileOutputStream(conf.outputFile())) + val stream = + if (conf.outputFile.isEmpty) System.out + else new PrintStream(new FileOutputStream(conf.outputFile())) // Both sourceFrom and sourceTo are set! if (conf.idFile.isEmpty) { @@ -92,9 +87,9 @@ object CodeMapper extends App with LazyLogging { maps.foreach(map => { stream.println( s"${map.fromSource}\t${map.fromCode}\t" + - s"${map.toSource}\t${map.toCode}\t" + - s"${map.conceptIds.mkString(", ")}\t" + - s"${map.labels.mkString("|")}" + s"${map.toSource}\t${map.toCode}\t" + + s"${map.conceptIds.mkString(", ")}\t" + + s"${map.labels.mkString("|")}" ) }) } else { @@ -105,33 +100,43 @@ object CodeMapper extends App with LazyLogging { val map = concepts.getMap(conf.fromSource(), ids, conf.toSource(), Seq.empty) val allTermCuis = concepts.getCUIsForCodes(conf.fromSource(), ids) - stream.println("fromSource\tid\tcuis\tlabels\tcountDirect\tcountViaParent\ttoIds\ttoLabels\tparentCuis\tparentSource\tparentIds\tparentLabels") + stream.println( + "fromSource\tid\tcuis\tlabels\tcountDirect\tcountViaParent\ttoIds\ttoLabels\tparentCuis\tparentSource\tparentIds\tparentLabels" + ) var count = 0 val mapByFromId = map.groupBy(_.fromCode) val matched = ids.map(id => { val maps = mapByFromId.getOrElse(id, Seq()) - val (parentStr, parentHalfMaps) = if (maps.nonEmpty) ("", Seq.empty) else { - val termCuis = allTermCuis.getOrElse(id, Seq.empty) - // logger.info(s"Checking $termCuis for parent AUI information.") - - val termAtomIds = concepts.getAUIsForCUIs(termCuis) - val parentAtomIds = rrfDir.hierarchy.getParents(termAtomIds) - val parentCUIs = concepts.getCUIsForAUI(parentAtomIds.toSeq) - val halfMaps = if(parentCUIs.isEmpty) Seq.empty else concepts.getMapsByCUIs(parentCUIs.toSeq, conf.toSource()) - - val cuis = halfMaps.map(_.cui).toSet - val sources = halfMaps.map(_.source).toSet - 
val codes = halfMaps.map(_.code).toSet - val labels = halfMaps.map(_.label).toSet - - (s"\t${cuis.mkString("|")}\t${sources.mkString("|")}\t${codes.mkString("|")}\t${labels.mkString("|")}", halfMaps) - } + val (parentStr, parentHalfMaps) = + if (maps.nonEmpty) ("", Seq.empty) + else { + val termCuis = allTermCuis.getOrElse(id, Seq.empty) + // logger.info(s"Checking $termCuis for parent AUI information.") + + val termAtomIds = concepts.getAUIsForCUIs(termCuis) + val parentAtomIds = rrfDir.hierarchy.getParents(termAtomIds) + val parentCUIs = concepts.getCUIsForAUI(parentAtomIds.toSeq) + val halfMaps = + if (parentCUIs.isEmpty) Seq.empty + else concepts.getMapsByCUIs(parentCUIs.toSeq, conf.toSource()) + + val cuis = halfMaps.map(_.cui).toSet + val sources = halfMaps.map(_.source).toSet + val codes = halfMaps.map(_.code).toSet + val labels = halfMaps.map(_.label).toSet + + (s"\t${cuis.mkString("|")}\t${sources.mkString("|")}\t${codes.mkString("|")}\t${labels + .mkString("|")}", halfMaps) + } val halfMaps = halfMapByCode.getOrElse(id, Seq()) stream.println( - s"${conf.fromSource()}\t$id\t${halfMaps.map(_.cui).toSet.mkString("|")}\t${halfMaps.map(_.label).toSet.mkString("|")}\t${maps.size}\t${parentHalfMaps.size}" + s"${conf.fromSource()}\t$id\t${halfMaps.map(_.cui).toSet.mkString("|")}\t${halfMaps + .map(_.label) + .toSet + .mkString("|")}\t${maps.size}\t${parentHalfMaps.size}" + s"\t${maps.map(m => m.toSource + ":" + m.toCode).mkString("|")}" + s"\t${maps.map(_.labels.mkString(";")).mkString("|")}" + s"$parentStr" @@ -139,7 +144,7 @@ object CodeMapper extends App with LazyLogging { count += 1 if (count % 100 == 0) { - val percentage = count.toFloat/ids.size * 100 + val percentage = count.toFloat / ids.size * 100 logger.info(f"Processed $count out of ${ids.size} IDs ($percentage%.2f%%)") } @@ -150,12 +155,16 @@ object CodeMapper extends App with LazyLogging { val matchedParent = matched.filter(_._2.nonEmpty).flatMap(_._2) val matchedTotal = matched.filter(m => 
m._1.nonEmpty || m._2.nonEmpty) - val percentageTerm = (matchedTerm.size.toFloat/ids.size) * 100 - val percentageParent = (matchedParent.size.toFloat/ids.size) * 100 - val percentageTotal = (matchedTotal.size.toFloat/ids.size) * 100 + val percentageTerm = (matchedTerm.size.toFloat / ids.size) * 100 + val percentageParent = (matchedParent.size.toFloat / ids.size) * 100 + val percentageTotal = (matchedTotal.size.toFloat / ids.size) * 100 logger.info(f"Matched ${matchedTerm.size} IDs out of ${ids.size} ($percentageTerm%.2f%%)") - logger.info(f"Matched a further ${matchedParent.size} IDs via the parent term ($percentageParent%.2f%%)") - logger.info(f"Total coverage: ${matchedTotal.size} IDs out of ${ids.size} ($percentageTotal%.2f%%)") + logger.info( + f"Matched a further ${matchedParent.size} IDs via the parent term ($percentageParent%.2f%%)" + ) + logger.info( + f"Total coverage: ${matchedTotal.size} IDs out of ${ids.size} ($percentageTotal%.2f%%)" + ) } stream.close() diff --git a/src/main/scala/org/renci/umls/db/DbConcepts.scala b/src/main/scala/org/renci/umls/db/DbConcepts.scala index abc6a19..a7be05b 100644 --- a/src/main/scala/org/renci/umls/db/DbConcepts.scala +++ b/src/main/scala/org/renci/umls/db/DbConcepts.scala @@ -19,7 +19,9 @@ import scala.collection.mutable import scala.io.Source /** A wrapper for RRFConcepts that uses SQLite */ -class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RRFConcepts(file, filename) with LazyLogging { +class DbConcepts(db: ConnectionFactory, file: File, filename: String) + extends RRFConcepts(file, filename) + with LazyLogging { implicit val halfMapCache: Cache[Seq[HalfMap]] = CaffeineCache[Seq[HalfMap]] /** The name of the table used to store this information. We include the SHA-256 hash so we reload it if it changes. 
*/ @@ -63,7 +65,7 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR val insertStmt = conn.prepareStatement( s"INSERT INTO $tableName (CUI, LAT, TS, LUI, STT, SUI, ISPREF, AUI, SAUI, SCUI, SDUI, SAB, TTY, CODE, STR, SRL, SUPPRESS, CVF) " + - "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" ) var count = 0 @@ -77,7 +79,7 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR count += 1 if (count % 100000 == 0) { - val percentage = count.toFloat/rowCount*100 + val percentage = count.toFloat / rowCount * 100 logger.info(f"Batched $count rows out of $rowCount ($percentage%.2f%%), executing.") insertStmt.executeBatch() insertStmt.clearBatch() @@ -96,10 +98,12 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR def getSources(): Seq[(String, Int)] = { val conn = db.createConnection() val query = conn.createStatement() - val rs = query.executeQuery(s"SELECT SAB, COUNT(*) AS count FROM $tableName GROUP BY SAB ORDER BY count DESC;") + val rs = query.executeQuery( + s"SELECT SAB, COUNT(*) AS count FROM $tableName GROUP BY SAB ORDER BY count DESC;" + ) var results = Seq[(String, Int)]() - while(rs.next()) { + while (rs.next()) { results = results :+ ( rs.getString(1), rs.getInt(2) @@ -111,56 +115,22 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR } // We use the CUIs to map everything from the fromSource to the toSource. - case class HalfMap(cui: String, aui: String, source: String, code:String, label:String) - - def getHalfMapsForCodes(source: String, ids: Seq[String]): Seq[HalfMap] = memoizeSync(Some(2.seconds)) { - // Retrieve all the fromIds. 
- val conn = db.createConnection() - if (ids.isEmpty) { - val query = conn.prepareStatement(s"SELECT CUI, AUI, SAB, CODE, STR FROM $tableName WHERE SAB=?") - query.setString(1, source) - val rs = query.executeQuery() - - logger.info(s"Loading halfmaps for $source") - var halfMap = Seq[HalfMap]() - var count = 0 - while(rs.next()) { - halfMap = HalfMap( - rs.getString(1), - rs.getString(2), - rs.getString(3), - rs.getString(4), - rs.getString(5) - ) +: halfMap - count += 1 - if (count % 100000 == 0) { - logger.info(s"Loaded $count halfmaps.") - } - } - - conn.close() - logger.info(s"${halfMap.size} halfmaps loaded.") - - halfMap - } else { - logger.info(s"Loading halfmaps for $source with identifiers: $ids.") - - var halfMap = Seq[HalfMap]() - var count = 0 - - val windowSize = (ids.size/10) + 1 - ids.sliding(windowSize, windowSize).foreach(idGroup => { - val indexedIds = idGroup.toIndexedSeq - val questions = idGroup.map(_ => "?").mkString(", ") - val query = conn.prepareStatement(s"SELECT DISTINCT CUI, AUI, SAB, CODE, STR FROM $tableName WHERE SAB=? AND CODE IN ($questions)") - + case class HalfMap(cui: String, aui: String, source: String, code: String, label: String) + + def getHalfMapsForCodes(source: String, ids: Seq[String]): Seq[HalfMap] = + memoizeSync(Some(2.seconds)) { + // Retrieve all the fromIds. 
+ val conn = db.createConnection() + if (ids.isEmpty) { + val query = + conn.prepareStatement(s"SELECT CUI, AUI, SAB, CODE, STR FROM $tableName WHERE SAB=?") query.setString(1, source) - (0 until idGroup.size).foreach(id => { - query.setString(id + 2, indexedIds(id)) - }) - val rs = query.executeQuery() - while(rs.next()) { + + logger.info(s"Loading halfmaps for $source") + var halfMap = Seq[HalfMap]() + var count = 0 + while (rs.next()) { halfMap = HalfMap( rs.getString(1), rs.getString(2), @@ -169,17 +139,57 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR rs.getString(5) ) +: halfMap count += 1 + if (count % 100000 == 0) { + logger.info(s"Loaded $count halfmaps.") + } } - logger.info(s"Loaded $count halfmaps.") - }) - - conn.close() - logger.info(s"${halfMap.size} halfmaps loaded.") - - halfMap + conn.close() + logger.info(s"${halfMap.size} halfmaps loaded.") + + halfMap + } else { + logger.info(s"Loading halfmaps for $source with identifiers: $ids.") + + var halfMap = Seq[HalfMap]() + var count = 0 + + val windowSize = (ids.size / 10) + 1 + ids + .sliding(windowSize, windowSize) + .foreach(idGroup => { + val indexedIds = idGroup.toIndexedSeq + val questions = idGroup.map(_ => "?").mkString(", ") + val query = conn.prepareStatement( + s"SELECT DISTINCT CUI, AUI, SAB, CODE, STR FROM $tableName WHERE SAB=? 
AND CODE IN ($questions)" + ) + + query.setString(1, source) + (0 until idGroup.size).foreach(id => { + query.setString(id + 2, indexedIds(id)) + }) + + val rs = query.executeQuery() + while (rs.next()) { + halfMap = HalfMap( + rs.getString(1), + rs.getString(2), + rs.getString(3), + rs.getString(4), + rs.getString(5) + ) +: halfMap + count += 1 + } + + logger.info(s"Loaded $count halfmaps.") + }) + + conn.close() + logger.info(s"${halfMap.size} halfmaps loaded.") + + halfMap + } } - } case class Mapping( fromSource: String, @@ -190,65 +200,69 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR atomIds: Set[String], labels: Set[String] ) - def getMap(fromSource: String, fromIds: Seq[String], toSource: String, toIds: Seq[String]): Seq[Mapping] = { + def getMap( + fromSource: String, + fromIds: Seq[String], + toSource: String, + toIds: Seq[String] + ): Seq[Mapping] = { val fromHalfMaps = getHalfMapsForCodes(fromSource, fromIds) val toHalfMaps = getHalfMapsForCodes(toSource, toIds) // Combine the halfmaps so we need to. - (fromHalfMaps ++ toHalfMaps).groupBy(_.cui).values.flatMap({ entries => - // Everything in entries is the "same" concept according to MRCONSO. - // So we partition this based on - val cuis = entries.map(_.cui).toSet - val auis = entries.map(_.aui).toSet - val labels = entries.map(_.label).toSet - val fromCodes = entries.filter(_.source == fromSource).map(_.code).toSet[String] - val toCodes = entries.filter(_.source == toSource).map(_.code).toSet[String] - - fromCodes.flatMap(fromCode => { - toCodes.map(toCode => { - Mapping( - fromSource, - fromCode, - toSource, - toCode, - cuis, - auis, - labels - ) + (fromHalfMaps ++ toHalfMaps) + .groupBy(_.cui) + .values + .flatMap({ entries => + // Everything in entries is the "same" concept according to MRCONSO. 
+ // So we partition this based on + val cuis = entries.map(_.cui).toSet + val auis = entries.map(_.aui).toSet + val labels = entries.map(_.label).toSet + val fromCodes = entries.filter(_.source == fromSource).map(_.code).toSet[String] + val toCodes = entries.filter(_.source == toSource).map(_.code).toSet[String] + + fromCodes.flatMap(fromCode => { + toCodes.map(toCode => { + Mapping(fromSource, fromCode, toSource, toCode, cuis, auis, labels) + }) }) }) - }).toSeq + .toSeq } // Look up maps by CUIs. // TODO: we might want to be able to call this without source. - def getMapsByCUIs(cuis: Seq[String], toSource: String): Seq[HalfMap] = memoizeSync(Some(2.seconds)) { - if (cuis.isEmpty) return Seq() + def getMapsByCUIs(cuis: Seq[String], toSource: String): Seq[HalfMap] = + memoizeSync(Some(2.seconds)) { + if (cuis.isEmpty) return Seq() + + val conn = db.createConnection() + val questions = cuis.map(_ => "?").mkString(", ") + val query = conn.prepareStatement( + s"SELECT DISTINCT CUI, AUI, SAB, CODE, STR FROM $tableName WHERE SAB=? AND CUI IN ($questions)" + ) + query.setString(1, toSource) + val indexedSeq = cuis.toIndexedSeq + (1 to cuis.size).foreach(index => { + query.setString(index + 1, indexedSeq(index - 1)) + }) - val conn = db.createConnection() - val questions = cuis.map(_ => "?").mkString(", ") - val query = conn.prepareStatement(s"SELECT DISTINCT CUI, AUI, SAB, CODE, STR FROM $tableName WHERE SAB=? 
AND CUI IN ($questions)") - query.setString(1, toSource) - val indexedSeq = cuis.toIndexedSeq - (1 to cuis.size).foreach(index => { - query.setString(index + 1, indexedSeq(index - 1)) - }) + var halfMaps = Seq[HalfMap]() + val rs = query.executeQuery() + while (rs.next()) { + halfMaps = HalfMap( + rs.getString(1), + rs.getString(2), + rs.getString(3), + rs.getString(4), + rs.getString(5) + ) +: halfMaps + } + conn.close() - var halfMaps = Seq[HalfMap]() - val rs = query.executeQuery() - while(rs.next()) { - halfMaps = HalfMap( - rs.getString(1), - rs.getString(2), - rs.getString(3), - rs.getString(4), - rs.getString(5) - ) +: halfMaps + halfMaps } - conn.close() - - halfMaps - } // Get the CUIs for given AUIs. def getCUIsForAUI(auis: Seq[String]): Set[String] = { @@ -256,7 +270,8 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR val conn = db.createConnection() val questions = auis.map(_ => "?").mkString(", ") - val query = conn.prepareStatement(s"SELECT DISTINCT CUI FROM $tableName WHERE AUI IN ($questions)") + val query = + conn.prepareStatement(s"SELECT DISTINCT CUI FROM $tableName WHERE AUI IN ($questions)") val indexedSeq = auis.toIndexedSeq (1 to auis.size).foreach(index => { query.setString(index, indexedSeq(index - 1)) @@ -264,7 +279,7 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR var results = Seq[String]() val rs = query.executeQuery() - while(rs.next()) { + while (rs.next()) { results = rs.getString(1) +: results } conn.close() @@ -277,7 +292,8 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR val conn = db.createConnection() val questions = cuis.map(_ => "?").mkString(", ") - val query = conn.prepareStatement(s"SELECT DISTINCT AUI FROM $tableName WHERE CUI IN ($questions)") + val query = + conn.prepareStatement(s"SELECT DISTINCT AUI FROM $tableName WHERE CUI IN ($questions)") val indexedSeq = cuis.toIndexedSeq (1 to cuis.size).foreach(index => { 
query.setString(index, indexedSeq(index - 1)) @@ -285,7 +301,7 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR var results = Seq[String]() val rs = query.executeQuery() - while(rs.next()) { + while (rs.next()) { results = rs.getString(1) +: results } conn.close() @@ -298,7 +314,9 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR val conn = db.createConnection() val questions = ids.map(_ => "?").mkString(", ") - val query = conn.prepareStatement(s"SELECT DISTINCT CODE, CUI FROM $tableName WHERE SAB=? AND CODE IN ($questions)") + val query = conn.prepareStatement( + s"SELECT DISTINCT CODE, CUI FROM $tableName WHERE SAB=? AND CODE IN ($questions)" + ) query.setString(1, source) val indexedSeq = ids.toIndexedSeq (1 to ids.size).foreach(index => { @@ -307,7 +325,7 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR var results = Seq[(String, String)]() val rs = query.executeQuery() - while(rs.next()) { + while (rs.next()) { results = (rs.getString(1), rs.getString(2)) +: results } conn.close() @@ -318,5 +336,6 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) extends RR object DbConcepts { /** Wrap an RRF file using a database to cache results. */ - def fromDatabase(db: ConnectionFactory, rrfFile: RRFFile) = new DbConcepts(db, rrfFile.file, rrfFile.filename) + def fromDatabase(db: ConnectionFactory, rrfFile: RRFFile) = + new DbConcepts(db, rrfFile.file, rrfFile.filename) } diff --git a/src/main/scala/org/renci/umls/db/DbHierarchy.scala b/src/main/scala/org/renci/umls/db/DbHierarchy.scala index e4dcc1c..0e451cd 100644 --- a/src/main/scala/org/renci/umls/db/DbHierarchy.scala +++ b/src/main/scala/org/renci/umls/db/DbHierarchy.scala @@ -15,19 +15,21 @@ import scala.io.Source /** Represents a single hierarchy entry. 
*/ case class HierarchyEntry( - ConceptId: String, // CUI - AtomId: String, // AUI - ContextNumber: String, // CXN - ParentAtomId: String, // PAUI - Source: String, // SAB - Relation: String, // RELA - PathToRoot: String, // PTR - HierarchyCode: String, // HCD - ContentViewFlag: String // CVF - ) + ConceptId: String, // CUI + AtomId: String, // AUI + ContextNumber: String, // CXN + ParentAtomId: String, // PAUI + Source: String, // SAB + Relation: String, // RELA + PathToRoot: String, // PTR + HierarchyCode: String, // HCD + ContentViewFlag: String // CVF +) /** A wrapper for RRFHierarchy that uses SQLite */ -class DbHierarchy(db: ConnectionFactory, file: File, filename: String) extends RRFHierarchy(file, filename) with LazyLogging { +class DbHierarchy(db: ConnectionFactory, file: File, filename: String) + extends RRFHierarchy(file, filename) + with LazyLogging { /** The name of the table used to store this information. We include the SHA-256 hash so we reload it if it changes. */ val tableName: String = "MRHIER_" + sha256 @@ -60,7 +62,7 @@ class DbHierarchy(db: ConnectionFactory, file: File, filename: String) extends R val insertStmt = conn.prepareStatement( s"INSERT INTO $tableName (CUI, AUI, CXN, PAUI, SAB, RELA, PTR, HCD, CVF) " + - "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)" + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)" ) var count = 0 @@ -74,7 +76,7 @@ class DbHierarchy(db: ConnectionFactory, file: File, filename: String) extends R count += 1 if (count % 100000 == 0) { - val percentage = count.toFloat/rowCount*100 + val percentage = count.toFloat / rowCount * 100 logger.info(f"Batched $count rows out of $rowCount ($percentage%.2f%%), executing.") insertStmt.executeBatch() insertStmt.clearBatch() @@ -95,7 +97,8 @@ class DbHierarchy(db: ConnectionFactory, file: File, filename: String) extends R val conn = db.createConnection() val questions = atomIds.map(_ => "?").mkString(", ") - val query = conn.prepareStatement(s"SELECT DISTINCT PAUI FROM $tableName WHERE AUI IN 
($questions)") + val query = + conn.prepareStatement(s"SELECT DISTINCT PAUI FROM $tableName WHERE AUI IN ($questions)") val indexedSeq = atomIds.toIndexedSeq (1 to atomIds.size).foreach(index => { query.setString(index, indexedSeq(index - 1)) @@ -103,7 +106,7 @@ class DbHierarchy(db: ConnectionFactory, file: File, filename: String) extends R var results = Seq[String]() val rs = query.executeQuery() - while(rs.next()) { + while (rs.next()) { results = rs.getString(1) +: results } conn.close() @@ -114,5 +117,6 @@ class DbHierarchy(db: ConnectionFactory, file: File, filename: String) extends R object DbHierarchy { /** Wrap an RRF file using a database to cache results. */ - def fromDatabase(db: ConnectionFactory, rrfFile: RRFFile) = new DbHierarchy(db, rrfFile.file, rrfFile.filename) -} \ No newline at end of file + def fromDatabase(db: ConnectionFactory, rrfFile: RRFFile) = + new DbHierarchy(db, rrfFile.file, rrfFile.filename) +} diff --git a/src/main/scala/org/renci/umls/rrf/RRFCols.scala b/src/main/scala/org/renci/umls/rrf/RRFCols.scala index 09874cd..5bc9117 100644 --- a/src/main/scala/org/renci/umls/rrf/RRFCols.scala +++ b/src/main/scala/org/renci/umls/rrf/RRFCols.scala @@ -23,26 +23,34 @@ class RRFCols(file: File, filename: String = "MRCOLS.RRF") extends RRFFile(file, // We'll just hard-code this for now. // Eventually, it'd be nice to have this automatically settable from MRCOLS.RRF itself, but // right now I just don't have the time. - rows.map(arr => Column( - arr(0), - arr(1), - arr(2), - arr(3).trim.toIntOption, - arr(4).trim.toFloatOption, - arr(5).trim.toIntOption, - arr(6), - arr(7) - )) + rows.map( + arr => + Column( + arr(0), + arr(1), + arr(2), + arr(3).trim.toIntOption, + arr(4).trim.toFloatOption, + arr(5).trim.toIntOption, + arr(6), + arr(7) + ) + ) } /** Retrieve a column by name. 
*/ - def getColumn(name: String, filename: String): Seq[Column] = columns.filter(_.Filename == filename).filter(_.Name == name) + def getColumn(name: String, filename: String): Seq[Column] = + columns.filter(_.Filename == filename).filter(_.Name == name) def getOnlyColumn(name: String, filename: String): Column = { val results = getColumn(name, filename) if (results.size < 1) - throw new RuntimeException(s"No column named $name found for filename $filename in ${this.filename}") + throw new RuntimeException( + s"No column named $name found for filename $filename in ${this.filename}" + ) else if (results.size > 1) - throw new RuntimeException(s"Too many columns named $name found for filename $filename in ${this.filename}: $results") + throw new RuntimeException( + s"Too many columns named $name found for filename $filename in ${this.filename}: $results" + ) else results.head } } @@ -50,4 +58,4 @@ class RRFCols(file: File, filename: String = "MRCOLS.RRF") extends RRFFile(file, object RRFCols { /** Wrap an RRF file as an RRFCols. */ def fromRRF(rrfFile: RRFFile) = new RRFCols(rrfFile.file, rrfFile.filename) -} \ No newline at end of file +} diff --git a/src/main/scala/org/renci/umls/rrf/RRFConcepts.scala b/src/main/scala/org/renci/umls/rrf/RRFConcepts.scala index 026c35e..436281a 100644 --- a/src/main/scala/org/renci/umls/rrf/RRFConcepts.scala +++ b/src/main/scala/org/renci/umls/rrf/RRFConcepts.scala @@ -4,24 +4,24 @@ import java.io.File /** Represents a single column entry. 
*/ case class Concept( - ConceptID: String, // CUI - Lang: String, // LAT - TermStatus: String, // TS - TermID: String, // LUI - StringType: String, // STT - StringID: String, // SUI - IsPreferred: Boolean, // ISPREF - AtomID: String, // AUI - SourceAtomID: String, // SAUI - SourceConceptID: String, // SCUI - SourceDescriptorID: String, // SDUI - Source: String, // SAB - TermType: String, // TTY - SourceEntryID: String, // CODE - EntryString: String, // STR - SourceRestriction: String, // SRL - SuppressibleFlag: String, // SUPPRESS - ContentViewFlag: String // CVF + ConceptID: String, // CUI + Lang: String, // LAT + TermStatus: String, // TS + TermID: String, // LUI + StringType: String, // STT + StringID: String, // SUI + IsPreferred: Boolean, // ISPREF + AtomID: String, // AUI + SourceAtomID: String, // SAUI + SourceConceptID: String, // SCUI + SourceDescriptorID: String, // SDUI + Source: String, // SAB + TermType: String, // TTY + SourceEntryID: String, // CODE + EntryString: String, // STR + SourceRestriction: String, // SRL + SuppressibleFlag: String, // SUPPRESS + ContentViewFlag: String // CVF ) /** @@ -33,33 +33,36 @@ class RRFConcepts(file: File, filename: String = "MRCONSO.RRF") extends RRFFile( // We'll just hard-code this for now. // Eventually, it'd be nice to have this automatically settable from MRFILES.RRF itself, but // right now I just don't have the time. 
- rows.map(arr => Concept( - arr(0), - arr(1), - arr(2), - arr(3), - arr(4), - arr(5), - arr(6).trim match { - case "Y" => true - case _ => false - }, - arr(7), - arr(8), - arr(9), - arr(10), - arr(12), - arr(13), - arr(14), - arr(15), - arr(16), - arr(17), - arr(18) - )) + rows.map( + arr => + Concept( + arr(0), + arr(1), + arr(2), + arr(3), + arr(4), + arr(5), + arr(6).trim match { + case "Y" => true + case _ => false + }, + arr(7), + arr(8), + arr(9), + arr(10), + arr(12), + arr(13), + arr(14), + arr(15), + arr(16), + arr(17), + arr(18) + ) + ) } } object RRFConcepts { /** Wrap an RRF file as an RRFCols. */ def fromRRF(rrfFile: RRFFile) = new RRFConcepts(rrfFile.file, rrfFile.filename) -} \ No newline at end of file +} diff --git a/src/main/scala/org/renci/umls/rrf/RRFDir.scala b/src/main/scala/org/renci/umls/rrf/RRFDir.scala index 063fe47..fb3448f 100644 --- a/src/main/scala/org/renci/umls/rrf/RRFDir.scala +++ b/src/main/scala/org/renci/umls/rrf/RRFDir.scala @@ -20,14 +20,19 @@ class RRFDir(dir: File, sqliteDbFile: File) { def getFile(filename: String): File = { val file = new File(dir, filename) - if (!file.exists()) throw new RuntimeException(s"Directory ${dir.getCanonicalPath} does not contain expected file $filename.") + if (!file.exists()) + throw new RuntimeException( + s"Directory ${dir.getCanonicalPath} does not contain expected file $filename." + ) file } def getRRFFile(filename: String): RRFFile = new RRFFile(getFile(filename), filename) /** Set up an SQLite database for us to use. 
*/ - lazy val sqliteDb:DriverManagerConnectionFactory = new DriverManagerConnectionFactory("jdbc:sqlite:" + sqliteDbFile.getPath) + lazy val sqliteDb: DriverManagerConnectionFactory = new DriverManagerConnectionFactory( + "jdbc:sqlite:" + sqliteDbFile.getPath + ) /** Get the release information for this release (from release.dat) */ lazy val releaseInfo: String = Source.fromFile(getFile("release.dat")).mkString diff --git a/src/main/scala/org/renci/umls/rrf/RRFFile.scala b/src/main/scala/org/renci/umls/rrf/RRFFile.scala index 5409cd5..2aa5d65 100644 --- a/src/main/scala/org/renci/umls/rrf/RRFFile.scala +++ b/src/main/scala/org/renci/umls/rrf/RRFFile.scala @@ -11,7 +11,8 @@ import org.apache.commons.codec.digest.DigestUtils */ class RRFFile(val file: File, val filename: String) { /** A list of all rows in this file. */ - lazy val rows: Seq[IndexedSeq[String]] = Source.fromFile(file).getLines.map(_.split("\\|").toIndexedSeq).toSeq + lazy val rows: Seq[IndexedSeq[String]] = + Source.fromFile(file).getLines.map(_.split("\\|").toIndexedSeq).toSeq /** Count the number of rows in this file. */ lazy val rowCount: Long = Source.fromFile(file).getLines.size diff --git a/src/main/scala/org/renci/umls/rrf/RRFFiles.scala b/src/main/scala/org/renci/umls/rrf/RRFFiles.scala index 03f57e8..4f7db57 100644 --- a/src/main/scala/org/renci/umls/rrf/RRFFiles.scala +++ b/src/main/scala/org/renci/umls/rrf/RRFFiles.scala @@ -17,7 +17,8 @@ case class FileEntry( * The RRFFiles file contains metadata on all of the files in the RRFDir. This is essential, since this contains a * list of all the columns in the file. */ -class RRFFiles(file: File, cols: RRFCols, filename: String = "MRFILES.RRF") extends RRFFile(file, filename) { +class RRFFiles(file: File, cols: RRFCols, filename: String = "MRFILES.RRF") + extends RRFFile(file, filename) { /** Return a list of all files in an RRFFiles file. */ def files: Seq[FileEntry] = { // We'll just hard-code this for now. 
@@ -48,5 +49,6 @@ class RRFFiles(file: File, cols: RRFCols, filename: String = "MRFILES.RRF") exte object RRFFiles { /** Wrap an RRF file as an RRFFiles class. */ - def fromRRF(rrfFile: RRFFile, rrfCols: RRFCols) = new RRFFiles(rrfFile.file, rrfCols, rrfFile.filename) -} \ No newline at end of file + def fromRRF(rrfFile: RRFFile, rrfCols: RRFCols) = + new RRFFiles(rrfFile.file, rrfCols, rrfFile.filename) +} diff --git a/src/main/scala/org/renci/umls/rrf/RRFHierarchy.scala b/src/main/scala/org/renci/umls/rrf/RRFHierarchy.scala index f6d52d7..9c50168 100644 --- a/src/main/scala/org/renci/umls/rrf/RRFHierarchy.scala +++ b/src/main/scala/org/renci/umls/rrf/RRFHierarchy.scala @@ -4,15 +4,15 @@ import java.io.File /** Represents a single hierarchy entry. */ case class HierarchyEntry( - ConceptId: String, // CUI - AtomId: String, // AUI - ContextNumber: String, // CXN - ParentAtomId: String, // PAUI - Source: String, // SAB - Relation: String, // RELA - PathToRoot: String, // PTR - HierarchyCode: String, // HCD - ContentViewFlag: String // CVF + ConceptId: String, // CUI + AtomId: String, // AUI + ContextNumber: String, // CXN + ParentAtomId: String, // PAUI + Source: String, // SAB + Relation: String, // RELA + PathToRoot: String, // PTR + HierarchyCode: String, // HCD + ContentViewFlag: String // CVF ) /** @@ -24,25 +24,19 @@ class RRFHierarchy(file: File, filename: String = "MRHIER.RRF") extends RRFFile( // We'll just hard-code this for now. // Eventually, it'd be nice to have this automatically settable from MRCOLS.RRF itself, but // right now I just don't have the time. 
- rows.map(arr => HierarchyEntry( - arr(0), - arr(1), - arr(2), - arr(3), - arr(4), - arr(5), - arr(6), - arr(7), - arr(8) - )) + rows.map( + arr => HierarchyEntry(arr(0), arr(1), arr(2), arr(3), arr(4), arr(5), arr(6), arr(7), arr(8)) + ) } lazy val hierarchiesByAtomId = hierarchies.groupBy(_.AtomId) - def getParents(atomIds: Seq[String]): Set[String] = atomIds.flatMap(hierarchiesByAtomId.getOrElse(_, Seq())).map(_.ParentAtomId).toSet + def getParents(atomIds: Seq[String]): Set[String] = + atomIds.flatMap(hierarchiesByAtomId.getOrElse(_, Seq())).map(_.ParentAtomId).toSet def getOnlyParent(atomIds: Seq[String]): String = { val set = getParents(atomIds) if (set.size < 1) throw new RuntimeException(s"No parents found for atom IDs: $atomIds") - if (set.size > 1) throw new RuntimeException(s"Too many parents found for atom IDs: $atomIds: $set") + if (set.size > 1) + throw new RuntimeException(s"Too many parents found for atom IDs: $atomIds: $set") set.head } } @@ -50,4 +44,4 @@ class RRFHierarchy(file: File, filename: String = "MRHIER.RRF") extends RRFFile( object RRFHierarchy { /** Wrap an RRF file as an RRFHierarchy. */ def fromRRF(rrfFile: RRFFile) = new RRFHierarchy(rrfFile.file, rrfFile.filename) -} \ No newline at end of file +} From 5fce917ec66340858c127e5de1ffd9fac7a7538c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 12 May 2020 23:22:10 -0400 Subject: [PATCH 2/9] Replaced logback with scribe. 
--- build.sbt | 2 +- src/main/resources/logback.xml | 11 ------- .../scala/org/renci/umls/CodeMapper.scala | 33 +++++++++---------- .../scala/org/renci/umls/db/DbConcepts.scala | 22 ++++++------- .../scala/org/renci/umls/db/DbHierarchy.scala | 10 +++--- 5 files changed, 31 insertions(+), 47 deletions(-) delete mode 100644 src/main/resources/logback.xml diff --git a/build.sbt b/build.sbt index e00421b..ec00f07 100644 --- a/build.sbt +++ b/build.sbt @@ -35,7 +35,7 @@ libraryDependencies ++= { Seq( // Logging "com.typesafe.scala-logging" %% "scala-logging" % "3.9.2", - "ch.qos.logback" % "logback-classic" % "1.2.3", + "com.outr" %% "scribe" % "2.7.12", // Command line argument parsing. "org.rogach" %% "scallop" % "3.3.2", diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml deleted file mode 100644 index 961d6ab..0000000 --- a/src/main/resources/logback.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n - - - - - - - diff --git a/src/main/scala/org/renci/umls/CodeMapper.scala b/src/main/scala/org/renci/umls/CodeMapper.scala index 88f8ff9..57fed17 100644 --- a/src/main/scala/org/renci/umls/CodeMapper.scala +++ b/src/main/scala/org/renci/umls/CodeMapper.scala @@ -4,7 +4,6 @@ import java.io.{File, FileOutputStream, PrintStream} import org.rogach.scallop._ import org.rogach.scallop.exceptions._ -import com.typesafe.scalalogging.{LazyLogging, Logger} import org.renci.umls.rrf.RRFDir import scala.io.Source @@ -12,15 +11,15 @@ import scala.io.Source /** * Map terms from one code system to another. */ -object CodeMapper extends App with LazyLogging { +object CodeMapper extends App { /** * Command line configuration for CodeMapper. 
*/ - class Conf(arguments: Seq[String], logger: Logger) extends ScallopConf(arguments) { + class Conf(arguments: Seq[String]) extends ScallopConf(arguments) { override def onError(e: Throwable): Unit = e match { case ScallopException(message) => printHelp - logger.error(message) + scribe.error(message) System.exit(1) case ex => super.onError(ex) } @@ -51,27 +50,27 @@ object CodeMapper extends App with LazyLogging { } // Parse command line arguments. - val conf = new Conf(args.toIndexedSeq, logger) + val conf = new Conf(args.toIndexedSeq) // Read RRF directory. val rrfDir = new RRFDir(conf.rrfDir(), conf.sqliteDb()) - logger.info(s"Loaded directory for release: ${rrfDir.releaseInfo}") - logger.info(s"Using SQLite backend: ${rrfDir.sqliteDb}") + scribe.info(s"Loaded directory for release: ${rrfDir.releaseInfo}") + scribe.info(s"Using SQLite backend: ${rrfDir.sqliteDb}") val concepts = rrfDir.concepts val sources = concepts.getSources if (conf.fromSource.isEmpty && conf.toSource.isEmpty) { - logger.info("Sources:") + scribe.info("Sources:") sources.map(entry => { - logger.info(s" - ${entry._1} (${entry._2} entries)") + scribe.info(s" - ${entry._1} (${entry._2} entries)") }) } else if (conf.fromSource.isEmpty) { // We know sourceTo is set. - logger.error(s"--source-from is empty, although --source-to is set to '${conf.toSource()}'") + scribe.error(s"--source-from is empty, although --source-to is set to '${conf.toSource()}'") } else if (conf.toSource.isEmpty) { // We know sourceFrom is set. - logger.error(s"--source-to is empty, although --source-from is set to '${conf.fromSource()}'") + scribe.error(s"--source-to is empty, although --source-from is set to '${conf.fromSource()}'") } else { // Do we need to filter first? 
@@ -94,7 +93,7 @@ object CodeMapper extends App with LazyLogging { }) } else { val ids = Source.fromFile(conf.idFile()).getLines.map(_.trim).toSeq - logger.info(s"Filtering to ${ids.size} IDs from ${conf.idFile()}.") + scribe.info(s"Filtering to ${ids.size} IDs from ${conf.idFile()}.") val halfMapByCode = concepts.getHalfMapsForCodes(conf.fromSource(), ids).groupBy(_.code) val map = concepts.getMap(conf.fromSource(), ids, conf.toSource(), Seq.empty) @@ -112,7 +111,7 @@ object CodeMapper extends App with LazyLogging { if (maps.nonEmpty) ("", Seq.empty) else { val termCuis = allTermCuis.getOrElse(id, Seq.empty) - // logger.info(s"Checking $termCuis for parent AUI information.") + // scribe.info(s"Checking $termCuis for parent AUI information.") val termAtomIds = concepts.getAUIsForCUIs(termCuis) val parentAtomIds = rrfDir.hierarchy.getParents(termAtomIds) @@ -145,7 +144,7 @@ object CodeMapper extends App with LazyLogging { count += 1 if (count % 100 == 0) { val percentage = count.toFloat / ids.size * 100 - logger.info(f"Processed $count out of ${ids.size} IDs ($percentage%.2f%%)") + scribe.info(f"Processed $count out of ${ids.size} IDs ($percentage%.2f%%)") } (maps, parentHalfMaps) @@ -158,11 +157,11 @@ object CodeMapper extends App with LazyLogging { val percentageTerm = (matchedTerm.size.toFloat / ids.size) * 100 val percentageParent = (matchedParent.size.toFloat / ids.size) * 100 val percentageTotal = (matchedTotal.size.toFloat / ids.size) * 100 - logger.info(f"Matched ${matchedTerm.size} IDs out of ${ids.size} ($percentageTerm%.2f%%)") - logger.info( + scribe.info(f"Matched ${matchedTerm.size} IDs out of ${ids.size} ($percentageTerm%.2f%%)") + scribe.info( f"Matched a further ${matchedParent.size} IDs via the parent term ($percentageParent%.2f%%)" ) - logger.info( + scribe.info( f"Total coverage: ${matchedTotal.size} IDs out of ${ids.size} ($percentageTotal%.2f%%)" ) } diff --git a/src/main/scala/org/renci/umls/db/DbConcepts.scala 
b/src/main/scala/org/renci/umls/db/DbConcepts.scala index a7be05b..4c2b81e 100644 --- a/src/main/scala/org/renci/umls/db/DbConcepts.scala +++ b/src/main/scala/org/renci/umls/db/DbConcepts.scala @@ -3,7 +3,6 @@ package org.renci.umls.db import java.io.File import java.sql.{Connection, PreparedStatement} -import com.typesafe.scalalogging.{LazyLogging, Logger} import org.apache.commons.dbcp2.ConnectionFactory import org.renci.umls.rrf @@ -20,8 +19,7 @@ import scala.io.Source /** A wrapper for RRFConcepts that uses SQLite */ class DbConcepts(db: ConnectionFactory, file: File, filename: String) - extends RRFConcepts(file, filename) - with LazyLogging { + extends RRFConcepts(file, filename) { implicit val halfMapCache: Cache[Seq[HalfMap]] = CaffeineCache[Seq[HalfMap]] /** The name of the table used to store this information. We include the SHA-256 hash so we reload it if it changes. */ @@ -35,9 +33,9 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) conn1.close() if (rowsFromDb > 0 && rowsFromDb == rowCount) { - logger.info(s"Concept table $tableName has $rowsFromDb rows.") + scribe.info(s"Concept table $tableName has $rowsFromDb rows.") } else { - logger.info(s"Concept table $tableName is not present or is out of sync. Regenerating.") + scribe.info(s"Concept table $tableName is not present or is out of sync. 
Regenerating.") val conn = db.createConnection() val regenerate = conn.createStatement() @@ -80,7 +78,7 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) count += 1 if (count % 100000 == 0) { val percentage = count.toFloat / rowCount * 100 - logger.info(f"Batched $count rows out of $rowCount ($percentage%.2f%%), executing.") + scribe.info(f"Batched $count rows out of $rowCount ($percentage%.2f%%), executing.") insertStmt.executeBatch() insertStmt.clearBatch() } @@ -127,7 +125,7 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) query.setString(1, source) val rs = query.executeQuery() - logger.info(s"Loading halfmaps for $source") + scribe.info(s"Loading halfmaps for $source") var halfMap = Seq[HalfMap]() var count = 0 while (rs.next()) { @@ -140,16 +138,16 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) ) +: halfMap count += 1 if (count % 100000 == 0) { - logger.info(s"Loaded $count halfmaps.") + scribe.info(s"Loaded $count halfmaps.") } } conn.close() - logger.info(s"${halfMap.size} halfmaps loaded.") + scribe.info(s"${halfMap.size} halfmaps loaded.") halfMap } else { - logger.info(s"Loading halfmaps for $source with identifiers: $ids.") + scribe.info(s"Loading halfmaps for $source with identifiers: $ids.") var halfMap = Seq[HalfMap]() var count = 0 @@ -181,11 +179,11 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) count += 1 } - logger.info(s"Loaded $count halfmaps.") + scribe.info(s"Loaded $count halfmaps.") }) conn.close() - logger.info(s"${halfMap.size} halfmaps loaded.") + scribe.info(s"${halfMap.size} halfmaps loaded.") halfMap } diff --git a/src/main/scala/org/renci/umls/db/DbHierarchy.scala b/src/main/scala/org/renci/umls/db/DbHierarchy.scala index 0e451cd..045d7ba 100644 --- a/src/main/scala/org/renci/umls/db/DbHierarchy.scala +++ b/src/main/scala/org/renci/umls/db/DbHierarchy.scala @@ -3,7 +3,6 @@ package org.renci.umls.db import java.io.File import 
java.sql.{Connection, PreparedStatement} -import com.typesafe.scalalogging.{LazyLogging, Logger} import org.apache.commons.dbcp2.ConnectionFactory import org.renci.umls.rrf @@ -28,8 +27,7 @@ case class HierarchyEntry( /** A wrapper for RRFHierarchy that uses SQLite */ class DbHierarchy(db: ConnectionFactory, file: File, filename: String) - extends RRFHierarchy(file, filename) - with LazyLogging { + extends RRFHierarchy(file, filename) { /** The name of the table used to store this information. We include the SHA-256 hash so we reload it if it changes. */ val tableName: String = "MRHIER_" + sha256 @@ -41,9 +39,9 @@ class DbHierarchy(db: ConnectionFactory, file: File, filename: String) conn1.close() if (rowsFromDb > 0 && rowsFromDb == rowCount) { - logger.info(s"Hierarchy table $tableName has $rowsFromDb rows.") + scribe.info(s"Hierarchy table $tableName has $rowsFromDb rows.") } else { - logger.info(s"Hierarchy table $tableName is not present or is out of sync. Regenerating.") + scribe.info(s"Hierarchy table $tableName is not present or is out of sync. Regenerating.") val conn = db.createConnection() val regenerate = conn.createStatement() @@ -77,7 +75,7 @@ class DbHierarchy(db: ConnectionFactory, file: File, filename: String) count += 1 if (count % 100000 == 0) { val percentage = count.toFloat / rowCount * 100 - logger.info(f"Batched $count rows out of $rowCount ($percentage%.2f%%), executing.") + scribe.info(f"Batched $count rows out of $rowCount ($percentage%.2f%%), executing.") insertStmt.executeBatch() insertStmt.clearBatch() } From 24d859a8801011cd93a24153015b8caad6221ce5 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 12 May 2020 23:25:04 -0400 Subject: [PATCH 3/9] Removed ExplicitResultTypes, which does not work in Scala 2.13. 
--- .scalafix.conf | 1 - 1 file changed, 1 deletion(-) diff --git a/.scalafix.conf b/.scalafix.conf index 5eff99f..436ec6c 100644 --- a/.scalafix.conf +++ b/.scalafix.conf @@ -1,5 +1,4 @@ rules = [ - ExplicitResultTypes, NoAutoTupling, RemoveUnused, DisableSyntax, From 89053673cfe1e2abb01c7045ef01d15af9938218 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 12 May 2020 23:25:47 -0400 Subject: [PATCH 4/9] Cleaned up code with Scalafix and Scalafmt. --- src/main/scala/org/renci/umls/db/DbConcepts.scala | 3 --- src/main/scala/org/renci/umls/db/DbHierarchy.scala | 3 --- 2 files changed, 6 deletions(-) diff --git a/src/main/scala/org/renci/umls/db/DbConcepts.scala b/src/main/scala/org/renci/umls/db/DbConcepts.scala index 4c2b81e..e786c05 100644 --- a/src/main/scala/org/renci/umls/db/DbConcepts.scala +++ b/src/main/scala/org/renci/umls/db/DbConcepts.scala @@ -1,10 +1,8 @@ package org.renci.umls.db import java.io.File -import java.sql.{Connection, PreparedStatement} import org.apache.commons.dbcp2.ConnectionFactory -import org.renci.umls.rrf import scala.util.Try import org.renci.umls.rrf._ @@ -14,7 +12,6 @@ import scalacache.memoization._ import scalacache.modes.sync._ import scala.concurrent.duration._ -import scala.collection.mutable import scala.io.Source /** A wrapper for RRFConcepts that uses SQLite */ diff --git a/src/main/scala/org/renci/umls/db/DbHierarchy.scala b/src/main/scala/org/renci/umls/db/DbHierarchy.scala index 045d7ba..1bf76a4 100644 --- a/src/main/scala/org/renci/umls/db/DbHierarchy.scala +++ b/src/main/scala/org/renci/umls/db/DbHierarchy.scala @@ -1,15 +1,12 @@ package org.renci.umls.db import java.io.File -import java.sql.{Connection, PreparedStatement} import org.apache.commons.dbcp2.ConnectionFactory -import org.renci.umls.rrf import scala.util.Try import org.renci.umls.rrf._ -import scala.collection.mutable import scala.io.Source /** Represents a single hierarchy entry. 
*/ From 10422435fa56e3edc58f01fbcb59fd1fea512648 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 22 Jul 2020 16:32:02 -0400 Subject: [PATCH 5/9] Create sbt-test.yml Test code with `sbt test` and code style with Scalafmt. --- .github/workflows/sbt-test.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/sbt-test.yml diff --git a/.github/workflows/sbt-test.yml b/.github/workflows/sbt-test.yml new file mode 100644 index 0000000..d814314 --- /dev/null +++ b/.github/workflows/sbt-test.yml @@ -0,0 +1,22 @@ +name: Scala CI + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up JDK 1.8 + uses: actions/setup-java@v1 + with: + java-version: 1.8 + - name: Run tests + run: sbt test + - name: Check code style with Scalafmt + uses: openlawteam/scalafmt-ci@v2.0.2 From a32901c79eac39c06a5c2b3d4e142ec2a9d8eec7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 22 Jul 2020 16:49:55 -0400 Subject: [PATCH 6/9] Updated .github scalafmt with `sbt scalafmtCheckAll`. Also upgraded scalafmt to latest version. 
--- .github/workflows/sbt-test.yml | 2 +- .scalafmt.conf | 2 +- project/plugins.sbt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/sbt-test.yml b/.github/workflows/sbt-test.yml index d814314..d8eeacb 100644 --- a/.github/workflows/sbt-test.yml +++ b/.github/workflows/sbt-test.yml @@ -19,4 +19,4 @@ jobs: - name: Run tests run: sbt test - name: Check code style with Scalafmt - uses: openlawteam/scalafmt-ci@v2.0.2 + run: sbt diff --git a/.scalafmt.conf b/.scalafmt.conf index 8b6b438..e0a952c 100644 --- a/.scalafmt.conf +++ b/.scalafmt.conf @@ -1,4 +1,4 @@ -version = "2.2.2" +version = "2.6.4" style = IntelliJ maxColumn = 100 align = some diff --git a/project/plugins.sbt b/project/plugins.sbt index 38163b8..186dd6c 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,4 +1,4 @@ // Code formatting and linting tools. -addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.0.1") +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.0") addSbtPlugin("org.wartremover" % "sbt-wartremover" % "2.4.3") addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.11") From 791164ad98bef9667beaeb40759c86c00695edeb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 22 Jul 2020 16:53:23 -0400 Subject: [PATCH 7/9] Tweaked scalafmt settings and restyled code. 
--- .scalafmt.conf | 4 ++-- .../scala/org/renci/umls/CodeMapper.scala | 23 +++++++++++-------- .../scala/org/renci/umls/db/DbConcepts.scala | 1 + .../scala/org/renci/umls/db/DbHierarchy.scala | 2 ++ .../scala/org/renci/umls/rrf/RRFCols.scala | 2 ++ .../org/renci/umls/rrf/RRFConcepts.scala | 2 ++ .../scala/org/renci/umls/rrf/RRFFile.scala | 1 + .../scala/org/renci/umls/rrf/RRFFiles.scala | 2 ++ .../org/renci/umls/rrf/RRFHierarchy.scala | 2 ++ 9 files changed, 28 insertions(+), 11 deletions(-) diff --git a/.scalafmt.conf b/.scalafmt.conf index e0a952c..92d444a 100644 --- a/.scalafmt.conf +++ b/.scalafmt.conf @@ -1,4 +1,4 @@ version = "2.6.4" -style = IntelliJ +preset = IntelliJ +align.preset = some maxColumn = 100 -align = some diff --git a/src/main/scala/org/renci/umls/CodeMapper.scala b/src/main/scala/org/renci/umls/CodeMapper.scala index 57fed17..06f9a58 100644 --- a/src/main/scala/org/renci/umls/CodeMapper.scala +++ b/src/main/scala/org/renci/umls/CodeMapper.scala @@ -12,17 +12,19 @@ import scala.io.Source * Map terms from one code system to another. */ object CodeMapper extends App { + /** * Command line configuration for CodeMapper. 
*/ class Conf(arguments: Seq[String]) extends ScallopConf(arguments) { - override def onError(e: Throwable): Unit = e match { - case ScallopException(message) => - printHelp - scribe.error(message) - System.exit(1) - case ex => super.onError(ex) - } + override def onError(e: Throwable): Unit = + e match { + case ScallopException(message) => + printHelp + scribe.error(message) + System.exit(1) + case ex => super.onError(ex) + } val version = getClass.getPackage.getImplementationVersion version(s"CodeMapper: map from one source to another (v$version)") @@ -125,8 +127,11 @@ object CodeMapper extends App { val codes = halfMaps.map(_.code).toSet val labels = halfMaps.map(_.label).toSet - (s"\t${cuis.mkString("|")}\t${sources.mkString("|")}\t${codes.mkString("|")}\t${labels - .mkString("|")}", halfMaps) + ( + s"\t${cuis.mkString("|")}\t${sources.mkString("|")}\t${codes.mkString("|")}\t${labels + .mkString("|")}", + halfMaps + ) } val halfMaps = halfMapByCode.getOrElse(id, Seq()) diff --git a/src/main/scala/org/renci/umls/db/DbConcepts.scala b/src/main/scala/org/renci/umls/db/DbConcepts.scala index e786c05..57ae484 100644 --- a/src/main/scala/org/renci/umls/db/DbConcepts.scala +++ b/src/main/scala/org/renci/umls/db/DbConcepts.scala @@ -330,6 +330,7 @@ class DbConcepts(db: ConnectionFactory, file: File, filename: String) } object DbConcepts { + /** Wrap an RRF file using a database to cache results. 
*/ def fromDatabase(db: ConnectionFactory, rrfFile: RRFFile) = new DbConcepts(db, rrfFile.file, rrfFile.filename) diff --git a/src/main/scala/org/renci/umls/db/DbHierarchy.scala b/src/main/scala/org/renci/umls/db/DbHierarchy.scala index 1bf76a4..1db3a4a 100644 --- a/src/main/scala/org/renci/umls/db/DbHierarchy.scala +++ b/src/main/scala/org/renci/umls/db/DbHierarchy.scala @@ -25,6 +25,7 @@ case class HierarchyEntry( /** A wrapper for RRFHierarchy that uses SQLite */ class DbHierarchy(db: ConnectionFactory, file: File, filename: String) extends RRFHierarchy(file, filename) { + /** The name of the table used to store this information. We include the SHA-256 hash so we reload it if it changes. */ val tableName: String = "MRHIER_" + sha256 @@ -111,6 +112,7 @@ class DbHierarchy(db: ConnectionFactory, file: File, filename: String) } object DbHierarchy { + /** Wrap an RRF file using a database to cache results. */ def fromDatabase(db: ConnectionFactory, rrfFile: RRFFile) = new DbHierarchy(db, rrfFile.file, rrfFile.filename) diff --git a/src/main/scala/org/renci/umls/rrf/RRFCols.scala b/src/main/scala/org/renci/umls/rrf/RRFCols.scala index 5bc9117..556af5b 100644 --- a/src/main/scala/org/renci/umls/rrf/RRFCols.scala +++ b/src/main/scala/org/renci/umls/rrf/RRFCols.scala @@ -18,6 +18,7 @@ case class Column( * The RRFCols file contains metadata on all of the columns across all files in the RRFDir. */ class RRFCols(file: File, filename: String = "MRCOLS.RRF") extends RRFFile(file, filename) { + /** A list of all columns in an RRFCols file. */ val columns: Seq[Column] = { // We'll just hard-code this for now. @@ -56,6 +57,7 @@ class RRFCols(file: File, filename: String = "MRCOLS.RRF") extends RRFFile(file, } object RRFCols { + /** Wrap an RRF file as an RRFCols. 
*/ def fromRRF(rrfFile: RRFFile) = new RRFCols(rrfFile.file, rrfFile.filename) } diff --git a/src/main/scala/org/renci/umls/rrf/RRFConcepts.scala b/src/main/scala/org/renci/umls/rrf/RRFConcepts.scala index 436281a..44c95f1 100644 --- a/src/main/scala/org/renci/umls/rrf/RRFConcepts.scala +++ b/src/main/scala/org/renci/umls/rrf/RRFConcepts.scala @@ -28,6 +28,7 @@ case class Concept( * The RRFConcepts file allows you to read concept data from MRCONSO.RRF. */ class RRFConcepts(file: File, filename: String = "MRCONSO.RRF") extends RRFFile(file, filename) { + /** A list of all columns in an RRFConcepts file. */ def concepts(): Seq[Concept] = { // We'll just hard-code this for now. @@ -63,6 +64,7 @@ class RRFConcepts(file: File, filename: String = "MRCONSO.RRF") extends RRFFile( } object RRFConcepts { + /** Wrap an RRF file as an RRFCols. */ def fromRRF(rrfFile: RRFFile) = new RRFConcepts(rrfFile.file, rrfFile.filename) } diff --git a/src/main/scala/org/renci/umls/rrf/RRFFile.scala b/src/main/scala/org/renci/umls/rrf/RRFFile.scala index 2aa5d65..f5cb081 100644 --- a/src/main/scala/org/renci/umls/rrf/RRFFile.scala +++ b/src/main/scala/org/renci/umls/rrf/RRFFile.scala @@ -10,6 +10,7 @@ import org.apache.commons.codec.digest.DigestUtils * Wraps a single RRF file. */ class RRFFile(val file: File, val filename: String) { + /** A list of all rows in this file. */ lazy val rows: Seq[IndexedSeq[String]] = Source.fromFile(file).getLines.map(_.split("\\|").toIndexedSeq).toSeq diff --git a/src/main/scala/org/renci/umls/rrf/RRFFiles.scala b/src/main/scala/org/renci/umls/rrf/RRFFiles.scala index 4f7db57..3a06457 100644 --- a/src/main/scala/org/renci/umls/rrf/RRFFiles.scala +++ b/src/main/scala/org/renci/umls/rrf/RRFFiles.scala @@ -19,6 +19,7 @@ case class FileEntry( */ class RRFFiles(file: File, cols: RRFCols, filename: String = "MRFILES.RRF") extends RRFFile(file, filename) { + /** Return a list of all files in an RRFFiles file. 
*/ def files: Seq[FileEntry] = { // We'll just hard-code this for now. @@ -48,6 +49,7 @@ class RRFFiles(file: File, cols: RRFCols, filename: String = "MRFILES.RRF") } object RRFFiles { + /** Wrap an RRF file as an RRFFiles class. */ def fromRRF(rrfFile: RRFFile, rrfCols: RRFCols) = new RRFFiles(rrfFile.file, rrfCols, rrfFile.filename) diff --git a/src/main/scala/org/renci/umls/rrf/RRFHierarchy.scala b/src/main/scala/org/renci/umls/rrf/RRFHierarchy.scala index 9c50168..c4a37cd 100644 --- a/src/main/scala/org/renci/umls/rrf/RRFHierarchy.scala +++ b/src/main/scala/org/renci/umls/rrf/RRFHierarchy.scala @@ -19,6 +19,7 @@ case class HierarchyEntry( * The RRFHierarchy file contains hierarchy information on atoms in the system. */ class RRFHierarchy(file: File, filename: String = "MRHIER.RRF") extends RRFFile(file, filename) { + /** A list of all columns in an RRFCols file. */ lazy val hierarchies: Seq[HierarchyEntry] = { // We'll just hard-code this for now. @@ -42,6 +43,7 @@ class RRFHierarchy(file: File, filename: String = "MRHIER.RRF") extends RRFFile( } object RRFHierarchy { + /** Wrap an RRF file as an RRFHierarchy. */ def fromRRF(rrfFile: RRFFile) = new RRFHierarchy(rrfFile.file, rrfFile.filename) } From 5edccf463d7d28d869fe0c2df44c29ad68593190 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 22 Jul 2020 17:24:42 -0400 Subject: [PATCH 8/9] Fixed incomplete line in sbt-test. --- .github/workflows/sbt-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/sbt-test.yml b/.github/workflows/sbt-test.yml index d8eeacb..a4af28c 100644 --- a/.github/workflows/sbt-test.yml +++ b/.github/workflows/sbt-test.yml @@ -19,4 +19,4 @@ jobs: - name: Run tests run: sbt test - name: Check code style with Scalafmt - run: sbt + run: sbt scalafmtCheckAll From 88e1edd2d2f30d450e4a19a0e0532d1289b91c0c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 15 Apr 2020 13:59:24 -0400 Subject: [PATCH 9/9] Added code for loading MRMAP into SQLite. 
--- .../scala/org/renci/umls/db/DbMappings.scala | 106 ++++++++++++++++++ .../scala/org/renci/umls/rrf/RRFDir.scala | 5 +- .../org/renci/umls/rrf/RRFMappings.scala | 78 +++++++++++++ 3 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 src/main/scala/org/renci/umls/db/DbMappings.scala create mode 100644 src/main/scala/org/renci/umls/rrf/RRFMappings.scala diff --git a/src/main/scala/org/renci/umls/db/DbMappings.scala b/src/main/scala/org/renci/umls/db/DbMappings.scala new file mode 100644 index 0000000..2258201 --- /dev/null +++ b/src/main/scala/org/renci/umls/db/DbMappings.scala @@ -0,0 +1,106 @@ +package org.renci.umls.db + +import java.io.File +import java.sql.{Connection, PreparedStatement} + +import com.typesafe.scalalogging.{LazyLogging, Logger} +import org.apache.commons.dbcp2.ConnectionFactory +import org.renci.umls.rrf + +import scala.util.Try +import org.renci.umls.rrf._ +import scalacache._ +import scalacache.caffeine._ +import scalacache.memoization._ +import scalacache.modes.sync._ + +import scala.concurrent.duration._ +import scala.collection.mutable +import scala.io.Source + +/** A wrapper for RRFMappings that uses */ +class DbMappings(db: ConnectionFactory, file: File, filename: String) extends RRFMappings(file, filename) with LazyLogging { + /** The name of the table used to store this information. We include the SHA-256 hash so we reload it if it changes. */ + val tableName: String = "MRMAP_" + sha256 + + /* Check to see if the MRMAP_ table seems up to date. If not, load it into memory from the file. 
*/ + val conn1 = db.createConnection() + val checkCount = conn1.createStatement() + val results = Try { checkCount.executeQuery(s"SELECT COUNT(*) AS cnt FROM $tableName") } + val rowsFromDb = if (results.isSuccess) results.get.getInt(1) else -1 + conn1.close() + + if (rowsFromDb > 0 && rowsFromDb == rowCount) { + logger.info(s"Mappings table $tableName has $rowsFromDb rows.") + } else { + logger.info(s"Mappings table $tableName is not present or is out of sync. Regenerating.") + + val conn = db.createConnection() + val regenerate = conn.createStatement() + regenerate.execute(s"DROP TABLE IF EXISTS $tableName") + regenerate.execute(s"""CREATE TABLE $tableName ( + |MAPSETCUI TEXT, + |MAPSETSAB TEXT, + |MAPSUBSETID TEXT, + |MAPRANK TEXT, + |MAPID TEXT, + |MAPSID TEXT, + |FROMID TEXT, + |FROMSID TEXT, + |FROMEXPR TEXT, + |FROMTYPE TEXT, + |FROMRULE TEXT, + |FROMRES TEXT, + |REL TEXT, + |RELA TEXT, + |TOID TEXT, + |TOSID TEXT, + |TOEXPR TEXT, + |TOTYPE TEXT, + |TORULE TEXT, + |TORES TEXT, + |MAPRULE TEXT, + |MAPRES TEXT, + |MAPTYPE TEXT, + |MAPATN TEXT, + |MAPATV TEXT, + |CVF TEXT + )""".stripMargin) + + val insertStmt = conn.prepareStatement( + s"INSERT INTO $tableName (MAPSETCUI, MAPSETSAB, MAPSUBSETID, MAPRANK, MAPID, MAPSID, FROMID, FROMSID, FROMEXPR, FROMTYPE, FROMRULE, FROMRES, REL, RELA, TOID, TOSID, TOEXPR, TOTYPE, TORULE, TORES, MAPRULE, MAPRES, MAPTYPE, MAPATN, MAPATV, CVF) " + + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" + ) + + var count = 0 + Source.fromFile(file).getLines.map(_.split("\\|", -1).toIndexedSeq) foreach { row => + insertStmt.clearParameters() + + (1 until 27) foreach ({ index => + insertStmt.setString(index, row(index - 1)) + }) + insertStmt.addBatch() + + count += 1 + if (count % 100000 == 0) { + val percentage = count.toFloat/rowCount*100 + logger.info(f"Batched $count rows out of $rowCount ($percentage%.2f%%), executing.") + insertStmt.executeBatch() + insertStmt.clearBatch() + } + } + 
insertStmt.executeBatch()
+
+    // Add indexes.
+    regenerate.execute(s"CREATE INDEX INDEX_MRMAP_FROMID ON $tableName (FROMID);")
+    regenerate.execute(s"CREATE INDEX INDEX_MRMAP_TOID ON $tableName (TOID);")
+    regenerate.execute(s"CREATE INDEX INDEX_MRMAP_REL ON $tableName (REL);")
+
+    conn.close()
+  }
+}
+
+object DbMappings {
+  /** Wrap an RRF file using a database to cache results. */
+  def fromDatabase(db: ConnectionFactory, rrfFile: RRFFile): DbMappings = new DbMappings(db, rrfFile.file, rrfFile.filename)
+}
\ No newline at end of file
diff --git a/src/main/scala/org/renci/umls/rrf/RRFDir.scala b/src/main/scala/org/renci/umls/rrf/RRFDir.scala
index fb3448f..5cb0d3c 100644
--- a/src/main/scala/org/renci/umls/rrf/RRFDir.scala
+++ b/src/main/scala/org/renci/umls/rrf/RRFDir.scala
@@ -3,7 +3,7 @@ package org.renci.umls.rrf
 import java.io.File
 
 import org.apache.commons.dbcp2.DriverManagerConnectionFactory
-import org.renci.umls.db.{DbConcepts, DbHierarchy}
+import org.renci.umls.db.{DbConcepts, DbHierarchy, DbMappings}
 
 import scala.io.Source
 
@@ -48,4 +48,7 @@ class RRFDir(dir: File, sqliteDbFile: File) {
 
   /** Loads MRCONSO.RRF files and makes them available. */
   lazy val concepts: DbConcepts = DbConcepts.fromDatabase(sqliteDb, getRRFFile("MRCONSO.RRF"))
+
+  /** Loads MRMAP.RRF files and makes them available. Lazy, like `concepts`, so MRMAP.RRF is only read on first use. */
+  lazy val mappings: DbMappings = DbMappings.fromDatabase(sqliteDb, getRRFFile("MRMAP.RRF"))
 }
diff --git a/src/main/scala/org/renci/umls/rrf/RRFMappings.scala b/src/main/scala/org/renci/umls/rrf/RRFMappings.scala
new file mode 100644
index 0000000..721ec15
--- /dev/null
+++ b/src/main/scala/org/renci/umls/rrf/RRFMappings.scala
@@ -0,0 +1,78 @@
+package org.renci.umls.rrf
+
+import java.io.File
+
+/** Represents a single mapping entry. */
+case class UMLSMapping(
+  mapSet: String, // MAPSETCUI Unique identifier for the UMLS concept which represents the whole map set.
+  mapSetSource: String, // MAPSETSAB Source abbreviation (SAB) for the provider of the map set.
+  mapSubsetId: String, // MAPSUBSETID Map subset identifier used to identify a subset of related mappings within a map set. This is used for cases where the FROMEXPR may have more than one potential mapping (optional).
+  mapRank: String, // MAPRANK Order in which mappings in a subset should be applied. Used only where MAPSUBSETID is used. (optional)
+  mapId: String, // MAPID Unique identifier for this individual mapping. Primary key of this table to identify a particular row.
+  mapSourceId: String, // MAPSID Source asserted identifier for this mapping (optional).
+  fromId: String, // FROMID Identifier for the entity being mapped from. This is an internal UMLS identifier used to point to an external entity in a source vocabulary (represented by the FROMEXPR). When the source provides such an identifier, it is reused here. Otherwise, it is generated by NLM. The FROMID is only unique within a map set. It is not a pointer to UMLS entities like atoms or concepts. There is a one-to-one correlation between FROMID and a unique set of values in FROMSID, FROMEXPR, FROMTYPE, FROMRULE, and FROMRES within a map set.
+  fromSourceId: String, // FROMSID Source asserted identifier for the entity being mapped from (optional).
+  fromExpr: String, // FROMEXPR Entity being mapped from - can be a single code/identifier/concept name or a complex expression involving multiple codes/identifiers/concept names, Boolean operators and/or punctuation
+  fromType: String, // FROMTYPE Type of entity being mapped from.
+  fromRule: String, // FROMRULE Machine processable rule applicable to the entity being mapped from (optional)
+  fromRestriction: String, // FROMRES Restriction applicable to the entity being mapped from (optional).
+  relationship: String, // REL Relationship of the entity being mapped from to the entity being mapped to.
+  relationshipAdditionalLabel: String, // RELA Additional relationship label (optional).
+  toId: String, // TOID Identifier for the entity being mapped to. This is an internal identifier used to point to an external entity in a source vocabulary (represented by the TOEXPR). When the source provides such an identifier, it is reused here. Otherwise, it is generated by NLM. The TOID is only unique within a map set. It is not a pointer to UMLS entities like atoms or concepts. There is a one-to-one correlation between TOID and a unique set of values in TOSID, TOEXPR, TOTYPE, TORULE, TORES within a map set.
+  toSourceId: String, // TOSID Source asserted identifier for the entity being mapped to (optional).
+  toExpr: String, // TOEXPR Entity being mapped to - can be a single code/identifier/concept name or a complex expression involving multiple codes/identifiers/concept names, Boolean operators and/or punctuation.
+  toType: String, // TOTYPE Type of entity being mapped to.
+  toRule: String, // TORULE Machine processable rule applicable to the entity being mapped to (optional).
+  toRestriction: String, // TORES Restriction applicable to the entity being mapped to (optional).
+  mapRule: String, // MAPRULE Machine processable rule applicable to this mapping (optional).
+  mapRestriction: String, // MAPRES Restriction applicable to this mapping (optional).
+  mapType: String, // MAPTYPE Type of mapping (optional).
+  mapAttributeName: String, // MAPATN The name of the attribute associated with this mapping [not yet in use]
+  mapAttributeValue: String, // MAPATV The value of the attribute associated with this mapping [not yet in use]
+  contentViewFlag: String // CVF The Content View Flag is a bit field used to indicate membership in a content view.
+)
+
+/**
+  * The RRFMappings file allows you to read mapping data from MRMAP.RRF.
+  */
+class RRFMappings(file: File, filename: String = "MRMAP.RRF") extends RRFFile(file, filename) {
+  /** Every row of this file as a UMLSMapping, with each column read by its position in the row. */
+  def concepts(): Seq[UMLSMapping] = {
+    // We'll just hard-code the column order for now: the 26 columns documented on UMLSMapping fill
+    // its 26 fields in order (indexes 0 to 25), so no index may be skipped. Eventually, it'd be nice
+    // to have this automatically settable from MRFILES.RRF itself.
+    rows.map(arr => UMLSMapping(
+      mapSet = arr(0),
+      mapSetSource = arr(1),
+      mapSubsetId = arr(2),
+      mapRank = arr(3),
+      mapId = arr(4),
+      mapSourceId = arr(5),
+      fromId = arr(6),
+      fromSourceId = arr(7),
+      fromExpr = arr(8),
+      fromType = arr(9),
+      fromRule = arr(10),
+      fromRestriction = arr(11),
+      relationship = arr(12),
+      relationshipAdditionalLabel = arr(13),
+      toId = arr(14),
+      toSourceId = arr(15),
+      toExpr = arr(16),
+      toType = arr(17),
+      toRule = arr(18),
+      toRestriction = arr(19),
+      mapRule = arr(20),
+      mapRestriction = arr(21),
+      mapType = arr(22),
+      mapAttributeName = arr(23),
+      mapAttributeValue = arr(24),
+      contentViewFlag = arr(25)
+    ))
+  }
+}
+
+object RRFMappings {
+  /** Wrap an RRF file as an RRFMappings. */
+  def fromRRF(rrfFile: RRFFile): RRFMappings = new RRFMappings(rrfFile.file, rrfFile.filename)
+}
\ No newline at end of file