Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use mappings from MRMAP #2

Draft
wants to merge 10 commits into
base: access-mrconso-via-sqlite
Choose a base branch
from
Draft
22 changes: 22 additions & 0 deletions .github/workflows/sbt-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Continuous integration for the Scala code: runs the test suite and the
# Scalafmt style check.
name: Scala CI

# Run on pushes to master and on pull requests that target master.
on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      # Check out the repository so that sbt can see the sources.
      - uses: actions/checkout@v2
      - name: Set up JDK 1.8
        uses: actions/setup-java@v1
        with:
          java-version: 1.8
      - name: Run tests
        run: sbt test
      # Fails the build if any source file is not formatted per .scalafmt.conf.
      - name: Check code style with Scalafmt
        run: sbt scalafmtCheckAll
1 change: 0 additions & 1 deletion .scalafix.conf
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
rules = [
ExplicitResultTypes,
NoAutoTupling,
RemoveUnused,
DisableSyntax,
Expand Down
6 changes: 3 additions & 3 deletions .scalafmt.conf
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version = "2.2.2"
style = IntelliJ
version = "2.6.4"
preset = IntelliJ
align.preset = some
maxColumn = 100
align = some
2 changes: 1 addition & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ libraryDependencies ++= {
Seq(
// Logging
"com.typesafe.scala-logging" %% "scala-logging" % "3.9.2",
"ch.qos.logback" % "logback-classic" % "1.2.3",
"com.outr" %% "scribe" % "2.7.12",

// Command line argument parsing.
"org.rogach" %% "scallop" % "3.3.2",
Expand Down
2 changes: 1 addition & 1 deletion project/plugins.sbt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Code formatting and linting tools.
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.0.1")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.0")
addSbtPlugin("org.wartremover" % "sbt-wartremover" % "2.4.3")
addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.11")
11 changes: 0 additions & 11 deletions src/main/resources/logback.xml

This file was deleted.

133 changes: 73 additions & 60 deletions src/main/scala/org/renci/umls/CodeMapper.scala
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,27 @@ import java.io.{File, FileOutputStream, PrintStream}

import org.rogach.scallop._
import org.rogach.scallop.exceptions._
import com.typesafe.scalalogging.{LazyLogging, Logger}
import org.renci.umls.rrf.RRFDir

import scala.io.Source

/**
* Map terms from one code system to another.
*/
object CodeMapper extends App with LazyLogging {
object CodeMapper extends App {

/**
* Command line configuration for CodeMapper.
*/
class Conf(arguments: Seq[String], logger: Logger) extends ScallopConf(arguments) {
override def onError(e: Throwable): Unit = e match {
case ScallopException(message) =>
printHelp
logger.error(message)
System.exit(1)
case ex => super.onError(ex)
}
class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
/**
 * Error handler overriding the one inherited from ScallopConf.
 *
 * For a ScallopException, print the help text, log the exception's message
 * as an error via scribe, and terminate the JVM with exit status 1. Any other
 * throwable is passed on to the superclass implementation.
 *
 * @param e the throwable to handle.
 */
override def onError(e: Throwable): Unit =
e match {
case ScallopException(message) =>
printHelp
scribe.error(message)
// Non-zero exit status after reporting the error.
System.exit(1)
case ex => super.onError(ex)
}

val version = getClass.getPackage.getImplementationVersion
version(s"CodeMapper: map from one source to another (v$version)")
Expand All @@ -38,52 +39,47 @@ object CodeMapper extends App with LazyLogging {
default = Some(new File("./sqlite.db"))
)

val fromSource: ScallopOption[String] = opt[String](
descr = "The source to translate from"
)
val fromSource: ScallopOption[String] = opt[String](descr = "The source to translate from")

val toSource: ScallopOption[String] = opt[String](
descr = "The source to translate to"
)
val toSource: ScallopOption[String] = opt[String](descr = "The source to translate to")

val idFile: ScallopOption[File] = opt[File](
descr = "A file containing identifiers (in a single, newline-delimited column)"
)
val idFile: ScallopOption[File] =
opt[File](descr = "A file containing identifiers (in a single, newline-delimited column)")

val outputFile: ScallopOption[File] = opt[File](
descr = "Where to write the output file"
)
val outputFile: ScallopOption[File] = opt[File](descr = "Where to write the output file")

verify()
}

// Parse command line arguments.
val conf = new Conf(args.toIndexedSeq, logger)
val conf = new Conf(args.toIndexedSeq)

// Read RRF directory.
val rrfDir = new RRFDir(conf.rrfDir(), conf.sqliteDb())
logger.info(s"Loaded directory for release: ${rrfDir.releaseInfo}")
logger.info(s"Using SQLite backend: ${rrfDir.sqliteDb}")
scribe.info(s"Loaded directory for release: ${rrfDir.releaseInfo}")
scribe.info(s"Using SQLite backend: ${rrfDir.sqliteDb}")

val concepts = rrfDir.concepts
val sources = concepts.getSources

if (conf.fromSource.isEmpty && conf.toSource.isEmpty) {
logger.info("Sources:")
scribe.info("Sources:")
sources.map(entry => {
logger.info(s" - ${entry._1} (${entry._2} entries)")
scribe.info(s" - ${entry._1} (${entry._2} entries)")
})
} else if (conf.fromSource.isEmpty) {
// fromSource is empty but the both-empty case was handled above, so toSource must be set.
logger.error(s"--source-from is empty, although --source-to is set to '${conf.toSource()}'")
scribe.error(s"--source-from is empty, although --source-to is set to '${conf.toSource()}'")
} else if (conf.toSource.isEmpty) {
// fromSource was found to be set by the previous check; only toSource is missing.
logger.error(s"--source-to is empty, although --source-from is set to '${conf.fromSource()}'")
scribe.error(s"--source-to is empty, although --source-from is set to '${conf.fromSource()}'")
} else {
// Do we need to filter first?

// Get ready to write output!
val stream = if (conf.outputFile.isEmpty) System.out else new PrintStream(new FileOutputStream(conf.outputFile()))
val stream =
if (conf.outputFile.isEmpty) System.out
else new PrintStream(new FileOutputStream(conf.outputFile()))

// Both fromSource and toSource are set!
if (conf.idFile.isEmpty) {
Expand All @@ -92,55 +88,68 @@ object CodeMapper extends App with LazyLogging {
maps.foreach(map => {
stream.println(
s"${map.fromSource}\t${map.fromCode}\t" +
s"${map.toSource}\t${map.toCode}\t" +
s"${map.conceptIds.mkString(", ")}\t" +
s"${map.labels.mkString("|")}"
s"${map.toSource}\t${map.toCode}\t" +
s"${map.conceptIds.mkString(", ")}\t" +
s"${map.labels.mkString("|")}"
)
})
} else {
val ids = Source.fromFile(conf.idFile()).getLines.map(_.trim).toSeq
logger.info(s"Filtering to ${ids.size} IDs from ${conf.idFile()}.")
scribe.info(s"Filtering to ${ids.size} IDs from ${conf.idFile()}.")

val halfMapByCode = concepts.getHalfMapsForCodes(conf.fromSource(), ids).groupBy(_.code)
val map = concepts.getMap(conf.fromSource(), ids, conf.toSource(), Seq.empty)
val allTermCuis = concepts.getCUIsForCodes(conf.fromSource(), ids)

stream.println("fromSource\tid\tcuis\tlabels\tcountDirect\tcountViaParent\ttoIds\ttoLabels\tparentCuis\tparentSource\tparentIds\tparentLabels")
stream.println(
"fromSource\tid\tcuis\tlabels\tcountDirect\tcountViaParent\ttoIds\ttoLabels\tparentCuis\tparentSource\tparentIds\tparentLabels"
)

var count = 0
val mapByFromId = map.groupBy(_.fromCode)
val matched = ids.map(id => {
val maps = mapByFromId.getOrElse(id, Seq())
val (parentStr, parentHalfMaps) = if (maps.nonEmpty) ("", Seq.empty) else {
val termCuis = allTermCuis.getOrElse(id, Seq.empty)
// logger.info(s"Checking $termCuis for parent AUI information.")

val termAtomIds = concepts.getAUIsForCUIs(termCuis)
val parentAtomIds = rrfDir.hierarchy.getParents(termAtomIds)
val parentCUIs = concepts.getCUIsForAUI(parentAtomIds.toSeq)
val halfMaps = if(parentCUIs.isEmpty) Seq.empty else concepts.getMapsByCUIs(parentCUIs.toSeq, conf.toSource())

val cuis = halfMaps.map(_.cui).toSet
val sources = halfMaps.map(_.source).toSet
val codes = halfMaps.map(_.code).toSet
val labels = halfMaps.map(_.label).toSet

(s"\t${cuis.mkString("|")}\t${sources.mkString("|")}\t${codes.mkString("|")}\t${labels.mkString("|")}", halfMaps)
}
val (parentStr, parentHalfMaps) =
if (maps.nonEmpty) ("", Seq.empty)
else {
val termCuis = allTermCuis.getOrElse(id, Seq.empty)
// scribe.info(s"Checking $termCuis for parent AUI information.")

val termAtomIds = concepts.getAUIsForCUIs(termCuis)
val parentAtomIds = rrfDir.hierarchy.getParents(termAtomIds)
val parentCUIs = concepts.getCUIsForAUI(parentAtomIds.toSeq)
val halfMaps =
if (parentCUIs.isEmpty) Seq.empty
else concepts.getMapsByCUIs(parentCUIs.toSeq, conf.toSource())

val cuis = halfMaps.map(_.cui).toSet
val sources = halfMaps.map(_.source).toSet
val codes = halfMaps.map(_.code).toSet
val labels = halfMaps.map(_.label).toSet

(
s"\t${cuis.mkString("|")}\t${sources.mkString("|")}\t${codes.mkString("|")}\t${labels
.mkString("|")}",
halfMaps
)
}

val halfMaps = halfMapByCode.getOrElse(id, Seq())

stream.println(
s"${conf.fromSource()}\t$id\t${halfMaps.map(_.cui).toSet.mkString("|")}\t${halfMaps.map(_.label).toSet.mkString("|")}\t${maps.size}\t${parentHalfMaps.size}"
s"${conf.fromSource()}\t$id\t${halfMaps.map(_.cui).toSet.mkString("|")}\t${halfMaps
.map(_.label)
.toSet
.mkString("|")}\t${maps.size}\t${parentHalfMaps.size}"
+ s"\t${maps.map(m => m.toSource + ":" + m.toCode).mkString("|")}"
+ s"\t${maps.map(_.labels.mkString(";")).mkString("|")}"
+ s"$parentStr"
)

count += 1
if (count % 100 == 0) {
val percentage = count.toFloat/ids.size * 100
logger.info(f"Processed $count out of ${ids.size} IDs ($percentage%.2f%%)")
val percentage = count.toFloat / ids.size * 100
scribe.info(f"Processed $count out of ${ids.size} IDs ($percentage%.2f%%)")
}

(maps, parentHalfMaps)
Expand All @@ -150,12 +159,16 @@ object CodeMapper extends App with LazyLogging {
val matchedParent = matched.filter(_._2.nonEmpty).flatMap(_._2)
val matchedTotal = matched.filter(m => m._1.nonEmpty || m._2.nonEmpty)

val percentageTerm = (matchedTerm.size.toFloat/ids.size) * 100
val percentageParent = (matchedParent.size.toFloat/ids.size) * 100
val percentageTotal = (matchedTotal.size.toFloat/ids.size) * 100
logger.info(f"Matched ${matchedTerm.size} IDs out of ${ids.size} ($percentageTerm%.2f%%)")
logger.info(f"Matched a further ${matchedParent.size} IDs via the parent term ($percentageParent%.2f%%)")
logger.info(f"Total coverage: ${matchedTotal.size} IDs out of ${ids.size} ($percentageTotal%.2f%%)")
val percentageTerm = (matchedTerm.size.toFloat / ids.size) * 100
val percentageParent = (matchedParent.size.toFloat / ids.size) * 100
val percentageTotal = (matchedTotal.size.toFloat / ids.size) * 100
scribe.info(f"Matched ${matchedTerm.size} IDs out of ${ids.size} ($percentageTerm%.2f%%)")
scribe.info(
f"Matched a further ${matchedParent.size} IDs via the parent term ($percentageParent%.2f%%)"
)
scribe.info(
f"Total coverage: ${matchedTotal.size} IDs out of ${ids.size} ($percentageTotal%.2f%%)"
)
}

stream.close()
Expand Down
Loading