Skip to content

Commit

Permalink
build: create the MiRTarBase parser for .xlsx files, #TASK-5576, #TAS…
Browse files Browse the repository at this point in the history
…K-5564

  - The parser logs and skips malformed rows in the .xlsx file instead of failing
  - The script fix-gene-symbol.sh is no longer needed, so it is removed
  - Update configuration files by removing the comments that referenced fix-gene-symbol.sh
  - Add JUnit test for the MiRTarBase parser

On branch TASK-5564
Changes to be committed:
	deleted:    cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh
	modified:   cellbase-core/src/main/resources/configuration.yml
	modified:   cellbase-core/src/test/resources/configuration.yml
	modified:   cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java
	modified:   cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java
	new file:   cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/MiRTarBaseIndexer.java
	modified:   cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java
	new file:   cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexerTest.java
	modified:   cellbase-lib/src/test/resources/configuration.test.yaml
  • Loading branch information
jtarraga committed Jul 4, 2024
1 parent 19efdf4 commit fcbb680
Show file tree
Hide file tree
Showing 9 changed files with 227 additions and 190 deletions.
60 changes: 0 additions & 60 deletions cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh

This file was deleted.

2 changes: 0 additions & 2 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,6 @@ download:
host: https://mirtarbase.cuhk.edu.cn/
version: "9.0"
files:
# This file contains errors and has to be fixed before building
# check the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh
MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx
MMUSCULUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/mmu_MTI.xlsx
RNORVEGICUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/rno_MTI.xlsx
Expand Down
2 changes: 0 additions & 2 deletions cellbase-core/src/test/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,6 @@ download:
host: https://mirtarbase.cuhk.edu.cn/
version: "9.0"
files:
# This file contains errors and has to be fixed before building
# check the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbol.sh
MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx
MMUSCULUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/mmu_MTI.xlsx
RNORVEGICUS_MIRTARBASE: ~miRTarBase/miRTarBase_2022/cache/download/9.0/rno_MTI.xlsx
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;

import static org.opencb.cellbase.lib.EtlCommons.*;

Expand Down Expand Up @@ -167,27 +166,7 @@ public void check() throws Exception {
miRBaseFile = checkFiles(MIRBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath();

// mirtarbase
// The downloaded .xlsx file contains errors and it has to be fixed manually
logger.info("Checking {} folder and files", getDataName(MIRTARBASE_DATA));
Path downloadRegulationPath = downloadPath.getParent().getParent().resolve(REGULATION_DATA);
List<String> mirTarBaseFiles = ((DataSource) dataSourceReader.readValue(downloadRegulationPath.resolve(
getDataVersionFilename(MIRTARBASE_DATA)).toFile())).getUrls().stream().map(u -> Paths.get(u).getFileName().toString())
.collect(Collectors.toList());
if (mirTarBaseFiles.size() != 1) {
throw new CellBaseException("One " + getDataName(MIRTARBASE_DATA) + " file is expected at " + downloadRegulationPath
+ ", but currently there are " + mirTarBaseFiles.size() + " files");
}
// The hsa_MIT.xlsx is fixed and converted to hsa_MIT.csv manually
if (!mirTarBaseFiles.get(0).endsWith(XLSX_EXTENSION)) {
throw new CellBaseException("A " + XLSX_EXTENSION + " " + getDataName(MIRTARBASE_DATA) + " file is expected at "
+ downloadRegulationPath + ", but currently it is named " + mirTarBaseFiles.get(0));
}
miRTarBaseFile = downloadRegulationPath.resolve(mirTarBaseFiles.get(0).replace(XLSX_EXTENSION, CSV_EXTENSION));
if (!Files.exists(miRTarBaseFile)) {
throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist. You"
+ " have to export the file " + mirTarBaseFiles.get(0) + " to " + miRTarBaseFile.getFileName() + " format separated by"
+ " tabs and then execute the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbols.sh");
}
miRTarBaseFile = checkFiles(MIRTARBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath();

// Check genome FASTA file
Path genomeDownloadPath = downloadPath.getParent().getParent().resolve(GENOME_DATA);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@
import org.opencb.biodata.formats.sequence.fasta.Fasta;
import org.opencb.biodata.formats.sequence.fasta.io.FastaReader;
import org.opencb.biodata.models.clinical.ClinicalProperty;
import org.opencb.biodata.models.core.*;
import org.opencb.biodata.models.core.CancerHotspot;
import org.opencb.biodata.models.core.CancerHotspotVariant;
import org.opencb.biodata.models.core.GeneCancerAssociation;
import org.opencb.biodata.models.core.MirnaTarget;
import org.opencb.biodata.models.variant.avro.GeneDrugInteraction;
import org.opencb.biodata.models.variant.avro.GeneTraitAssociation;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.commons.utils.FileUtils;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
Expand All @@ -38,12 +40,12 @@
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;

import static org.opencb.cellbase.lib.EtlCommons.*;
import static org.opencb.cellbase.lib.EtlCommons.DISGENET_DATA;
import static org.opencb.cellbase.lib.EtlCommons.ENSEMBL_DATA;
import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_DONE_LOG_MESSAGE;
import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_LOG_MESSAGE;

Expand Down Expand Up @@ -608,84 +610,15 @@ protected void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOE
}
}

protected void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException, CellBaseException {
logger.info(PARSING_LOG_MESSAGE, miRTarBaseFile);

try (BufferedReader reader = Files.newBufferedReader(miRTarBaseFile)) {
String line;
// Skip header line
reader.readLine();

String currentMiRTarBaseId = null;
String currentMiRNA = null;
String currentGene = null;
List<TargetGene> targetGenes = new ArrayList<>();
Map<String, List<MirnaTarget>> geneToMirna = new HashMap<>();

while ((line = reader.readLine()) != null) {
String[] field = line.split("\t", -1);
if (field.length != 9) {
throw new CellBaseException("Invalid number of columns " + field.length + " (expected 9 columns) parsing file "
+ miRTarBaseFile + ". Line: " + line);
}

// #0: miRTarBase ID
String miRTarBaseId = field[0];
if (currentMiRTarBaseId == null) {
currentMiRTarBaseId = miRTarBaseId;
}

// #1: miRNA
String miRNA = field[1];
if (currentMiRNA == null) {
currentMiRNA = miRNA;
}

// #2: Species (miRNA)

// #3: Target Gene
String geneName = field[3];
if (currentGene == null) {
currentGene = geneName;
}

// #4: Target Gene (Entrez ID)
// #5: Species (Target Gene)

if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) {
// new entry, store current one
MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, "miRTarBase", currentMiRNA, targetGenes);
addValueToMapElement(geneToMirna, currentGene, miRnaTarget);
targetGenes = new ArrayList<>();
currentGene = geneName;
currentMiRTarBaseId = miRTarBaseId;
currentMiRNA = miRNA;
}

// #6: Experiments
String experiment = field[6];

// #7: Support Type
String supportType = field[7];

// #8: pubmed
String pubmed = field[8];

targetGenes.add(new TargetGene(experiment, supportType, pubmed));
}

// parse last entry
MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, MIRTARBASE_DATA, currentMiRNA, targetGenes);
addValueToMapElement(geneToMirna, currentGene, miRnaTarget);

for (Map.Entry<String, List<MirnaTarget>> entry : geneToMirna.entrySet()) {
rocksDbManager.update(rocksdb, entry.getKey() + MIRTARBASE_SUFFIX, entry.getValue());
}
/**
 * Parses the given miRTarBase file and stores, for every target gene, its list of miRNA targets in RocksDB.
 * The RocksDB key is the gene name followed by {@code MIRTARBASE_SUFFIX}.
 *
 * @param miRTarBaseFile Path to the miRTarBase file to be indexed
 * @throws IOException if the file can not be read or parsed
 * @throws RocksDBException if the RocksDB update fails
 */
protected void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException {
    // The actual parsing is delegated to the dedicated indexer; here we only persist its output
    Map<String, List<MirnaTarget>> mirnaTargetsByGene = new MiRTarBaseIndexer().index(miRTarBaseFile);

    for (String geneName : mirnaTargetsByGene.keySet()) {
        List<MirnaTarget> mirnaTargets = mirnaTargetsByGene.get(geneName);
        rocksDbManager.update(rocksdb, geneName + MIRTARBASE_SUFFIX, mirnaTargets);
    }

    logger.info(PARSING_DONE_LOG_MESSAGE, miRTarBaseFile);
}

protected static <T> void addValueToMapElement(Map<String, List<T>> map, String key, T value) {
public static <T> void addValueToMapElement(Map<String, List<T>> map, String key, T value) {
if (map.containsKey(key)) {
map.get(key).add(value);
} else {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/*
* Copyright 2015-2020 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.opencb.cellbase.lib.builders;

import org.apache.commons.lang3.StringUtils;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.opencb.biodata.models.core.MirnaTarget;
import org.opencb.biodata.models.core.TargetGene;
import org.opencb.commons.utils.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.opencb.cellbase.lib.EtlCommons.MIRTARBASE_DATA;
import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_DONE_LOG_MESSAGE;
import static org.opencb.cellbase.lib.builders.CellBaseBuilder.PARSING_LOG_MESSAGE;

/**
 * Parser for miRTarBase {@code .xlsx} files.
 *
 * <p>It reads the first sheet of the workbook, skips the header row and groups consecutive rows that share the same
 * miRTarBase ID and target gene into a single {@link MirnaTarget}. Rows that can not be parsed (missing row, wrong
 * number of columns, missing or non-text mandatory fields) are logged as warnings and skipped, so the parsing never
 * aborts because of a malformed row.
 */
public class MiRTarBaseIndexer {

    /** Number of cells every data row is expected to contain. */
    private static final int NUM_COLUMNS = 9;

    // Column indexes of the fields read from each row
    private static final int MIRTARBASE_ID_COLUMN = 0;
    private static final int MIRNA_COLUMN = 1;
    private static final int TARGET_GENE_COLUMN = 3;
    private static final int EXPERIMENTS_COLUMN = 6;
    private static final int SUPPORT_TYPE_COLUMN = 7;
    private static final int PUBMED_COLUMN = 8;

    protected Logger logger;

    public MiRTarBaseIndexer() {
        logger = LoggerFactory.getLogger(this.getClass());
    }

    /**
     * Parses the miRTarBase {@code .xlsx} file and builds the map from target gene name to its miRNA targets.
     *
     * @param miRTarBaseFile Path to the miRTarBase {@code .xlsx} file
     * @return Map whose keys are target gene names and whose values are the miRNA targets found for that gene;
     *         it is empty if the sheet contains no valid data row
     * @throws IOException if the file does not pass the file check or the workbook can not be read
     */
    public Map<String, List<MirnaTarget>> index(Path miRTarBaseFile) throws IOException {
        FileUtils.checkFile(miRTarBaseFile);

        logger.info(PARSING_LOG_MESSAGE, miRTarBaseFile);

        Map<String, List<MirnaTarget>> geneToMirna = new HashMap<>();

        try (InputStream fis = new FileInputStream(miRTarBaseFile.toFile());
             Workbook workbook = new XSSFWorkbook(fis)) {

            // Get the first sheet
            Sheet sheet = workbook.getSheetAt(0);

            String currentMiRTarBaseId = null;
            String currentMiRNA = null;
            String currentGene = null;
            List<TargetGene> targetGenes = new ArrayList<>();

            // Start one row after the first one in order to skip the header
            for (int rowNum = sheet.getFirstRowNum() + 1; rowNum <= sheet.getLastRowNum(); rowNum++) {
                Row row = sheet.getRow(rowNum);

                // Sanity check: getRow() returns null for rows that are not defined in the sheet
                if (row == null) {
                    logger.warn("Error parsing line {}: row is not defined.", rowNum + 1);
                    continue;
                }

                if (row.getPhysicalNumberOfCells() != NUM_COLUMNS) {
                    logger.warn("Error parsing line {}: invalid number of columns {} (expected 9 columns).",
                            rowNum + 1, row.getPhysicalNumberOfCells());
                    continue;
                }

                // #0: miRTarBase ID, #1: miRNA, #3: Target Gene (mandatory text fields)
                // #2: Species (miRNA), #4: Target Gene (Entrez ID), #5: Species (Target Gene) are not used
                String miRTarBaseId = getStringValue(row.getCell(MIRTARBASE_ID_COLUMN));
                String miRNA = getStringValue(row.getCell(MIRNA_COLUMN));
                String geneName = getStringValue(row.getCell(TARGET_GENE_COLUMN));
                if (miRTarBaseId == null || miRNA == null || geneName == null) {
                    logger.warn("Error parsing line {}: mandatory fields(miRTarBase ID, miRNA, Target Gene) are empty or wrong cell type.",
                            rowNum + 1);
                    continue;
                }

                if (currentMiRTarBaseId == null) {
                    // First valid row: initialise the entry being accumulated
                    currentMiRTarBaseId = miRTarBaseId;
                    currentMiRNA = miRNA;
                    currentGene = geneName;
                } else if (!miRTarBaseId.equals(currentMiRTarBaseId) || !geneName.equals(currentGene)) {
                    // new entry, store current one
                    MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, MIRTARBASE_DATA, currentMiRNA, targetGenes);
                    GeneBuilderIndexer.addValueToMapElement(geneToMirna, currentGene, miRnaTarget);
                    targetGenes = new ArrayList<>();
                    currentGene = geneName;
                    currentMiRTarBaseId = miRTarBaseId;
                    currentMiRNA = miRNA;
                }

                // #6: Experiments, #7: Support Type, #8: pubmed (optional fields, null when missing or wrong type)
                String experiment = getStringValue(row.getCell(EXPERIMENTS_COLUMN));
                String supportType = getStringValue(row.getCell(SUPPORT_TYPE_COLUMN));
                String pubmed = getPubmedValue(row.getCell(PUBMED_COLUMN));

                if (StringUtils.isNotEmpty(experiment) || StringUtils.isNotEmpty(supportType) || StringUtils.isNotEmpty(pubmed)) {
                    targetGenes.add(new TargetGene(experiment, supportType, pubmed));
                }
            }

            // parse last entry; there is none when the sheet did not contain any valid row
            if (currentMiRTarBaseId != null) {
                MirnaTarget miRnaTarget = new MirnaTarget(currentMiRTarBaseId, MIRTARBASE_DATA, currentMiRNA, targetGenes);
                GeneBuilderIndexer.addValueToMapElement(geneToMirna, currentGene, miRnaTarget);
            }
        }
        logger.info(PARSING_DONE_LOG_MESSAGE, miRTarBaseFile);

        return geneToMirna;
    }

    /**
     * Returns the text of a cell, or {@code null} when the cell is missing or is not a text cell.
     *
     * @param cell Cell to read, it may be {@code null}
     * @return The cell text, or {@code null}
     */
    private static String getStringValue(Cell cell) {
        if (cell == null || cell.getCellType() != CellType.STRING) {
            return null;
        }
        return cell.getStringCellValue();
    }

    /**
     * Returns the PubMed ID stored in a cell as a string. Numeric cells are converted without scientific notation,
     * text cells are returned as they are; any other case (missing cell, blank cell, etc.) yields {@code null}.
     *
     * @param cell Cell to read, it may be {@code null}
     * @return The PubMed ID, or {@code null}
     */
    private static String getPubmedValue(Cell cell) {
        if (cell == null) {
            return null;
        }
        if (cell.getCellType() == CellType.NUMERIC) {
            // Excel stores numbers as doubles; BigDecimal avoids the trailing ".0" of Double.toString()
            return new BigDecimal(cell.getNumericCellValue()).toPlainString();
        }
        if (cell.getCellType() == CellType.STRING) {
            return cell.getStringCellValue();
        }
        return null;
    }
}
Loading

0 comments on commit fcbb680

Please sign in to comment.