Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
Xinpeng021001 authored Jul 28, 2024
1 parent af05751 commit 3c43a75
Show file tree
Hide file tree
Showing 6 changed files with 2,108 additions and 0 deletions.
Empty file added __init__.py
Empty file.
276 changes: 276 additions & 0 deletions cgc_process_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
##########################
# to generate json file for all cgc_standard.out files from run_dbcan
# use: python cgc_process_json.py -i cgc_standard.out -o cgc_standard.out.json
# written by Roland Madadjim in Cui's lab at Soc, UNL
# last updated: 12/09/2022
##########################

# from __future__ import print_function
import argparse
import json
import os

import numpy as np
import pandas as pd


class PrePro:
    """
    Preprocess run_dbcan CGC output into JSON-ready structures.

    Attributes
    ----------
    df (DataFrame): The CGC table loaded from cgc_standard.out.
    Args:
        data (DataFrame): The data to be processed.
    """

    # Gene types whose protein family is copied verbatim into the gene string.
    # Any other non-Null_Type value is silently skipped (original behavior).
    _FAMILY_TYPES = ("CAZyme", "TC", "TF", "STP")

    def __init__(self, data):
        self.df = data

    def extract_gs(self, dataList):
        """
        Build a dash-separated gene string from a list of gene records.

        CAZyme/TC/TF/STP entries contribute their protein family, Null_Type
        entries contribute "NA", and every other gene type is dropped.

        Args:
            dataList (list): Dicts with "Gene_Type" and "Protein_Family" keys.
        Returns
        -------
        str: Protein families joined with "-".
        """
        parts = []
        for record in dataList:
            gene_type = "{}".format(record["Gene_Type"])
            if gene_type == "Null_Type":
                parts.append("NA")
            elif gene_type in self._FAMILY_TYPES:
                parts.append("{}".format(record["Protein_Family"]))
        return "-".join(parts)

    def pul_section(self):
        """
        Processes the dataframe and yields structured genetic data.

        Groups the dataframe by 'cgc_id' and yields one dictionary per
        cluster containing its gene string and protein records.
        Yields
        ------
        dict: A dictionary keyed by cgc_id with cluster-level information.
        """
        for cgc_id, df_pul_grouped in self.df.groupby("cgc_id"):
            # Materialize the protein list once and reuse it: the original
            # code ran cluster_section() twice per cluster.
            proteins = list(self.cluster_section(df_pul_grouped))
            yield {
                cgc_id: {
                    "changelog": [],
                    "Cluster_ID": cgc_id,
                    "Gene_String": self.extract_gs(proteins),
                    # groupby preserves within-group row order, so .iloc[0]
                    # matches the original whole-frame .loc lookup.
                    "Contig_ID": df_pul_grouped["contig_id"].iloc[0],
                    "ncbi_species_tax_id": [],
                    "organism_name": [],
                    "publication": [],
                    "Protein": proteins,
                    # "dbCan_Pul_accession": ID, # as string or integer?
                    # "publication": df.loc[df['ID'] == ID, 'PMID'].iloc[0],
                }
            }

    def cluster_section(self, df_pul_grouped):
        """
        Yields structured data for each gene cluster row.

        Groups the cluster frame by the full gene-identity tuple (this also
        sorts and de-duplicates identical rows) and yields one dict per
        unique gene.

        Args:
            df_pul_grouped (DataFrame): Rows belonging to a single cgc_id.
        Yields
        ------
        dict: A dictionary describing one gene of the cluster.
        """
        key_columns = [
            "contig_id",
            "gene_type",
            "protein_id",
            "gene_start",
            "gene_stop",
            "direction",
            "protein_family",
        ]
        for keys, _ in df_pul_grouped.groupby(key_columns):
            (
                contig_id,
                gene_type,
                protein_id,
                gene_start,
                gene_stop,
                direction,
                protein_family,
            ) = keys
            yield {
                "contig_id": contig_id,
                "protein_id": protein_id,
                "Gene_Type": gene_type,
                "Gene_Start": gene_start,
                "Gene_Stop": gene_stop,
                "Strand": direction,
                "Protein_Family": protein_family,
            }

    def run_dbCan_section(self, df_puls):
        """
        Processes the dataframe and yields gene and protein information.

        NOTE(review): this method is a stub — it currently yields an empty
        dict per input row. The intended fields are kept below as a template.

        Args:
            df_puls (DataFrame): The dataframe containing gene and protein data.
        Yields
        ------
        dict: Currently always empty; see the commented template.
        """
        for _ in df_puls.itertuples():
            yield {
                # "Gene_Type": row.gene_type,
                # "Gene_Start": row.gene_start,
                # "Gene_Stop": row.gene_stop,
                # "Strand": row.direction,
                # "Protein_Family": row.protein_family
            }


def file_ext(choices, fname, parser):
    """
    Validates the extension of a given file name against allowed choices.

    Parameters
    ----------
    choices (tuple): A tuple of allowed file extensions (without the dot).
    fname (str): The file name to be checked.
    parser (argparse.ArgumentParser): Parser used to raise the error.
    Returns
    -------
    str: The validated file name if its extension is in `choices`.
    Raises
    ------
    SystemExit: Via parser.error() if the extension is not allowed.
    """
    ext = os.path.splitext(fname)[1][1:]
    if ext not in choices:
        # Build the message from `choices` instead of hard-coding
        # ".out or .csv", so the check stays truthful for any choices tuple.
        parser.error("File needs to be a " + " or ".join("." + c for c in choices))
    return fname


class CustomEncoder(json.JSONEncoder):
    """
    Custom JSON encoder for handling numpy data types.

    Extends `json.JSONEncoder` to serialize numpy integers (which the
    default encoder rejects) by converting them to strings.
    Methods
    -------
    default(obj): Overrides the default method to provide custom serialization.
    """

    def default(self, obj):
        """
        Provides custom serialization for numpy integer types.

        Parameters
        ----------
        obj: The object to serialize.
        Returns
        -------
        The serialized object: numpy integers become strings; everything
        else is delegated to the superclass (which raises TypeError for
        unsupported types).
        """
        # np.integer covers np.int64 (the original case) plus np.int32 and
        # other fixed-width integer types pandas may produce per platform.
        if isinstance(obj, np.integer):
            return str(obj)
        return super().default(obj)


def main():
    """
    Main function to compile JSON from a 'cgc_standard.out' file.

    Parses command-line arguments, reads the tab-separated input, renames
    columns to snake_case, fills missing values, and writes the processed
    clusters to a JSON file using the custom numpy-aware encoder.

    Side Effects:
        - Reads the input file given by -i.
        - Writes a JSON file to -o (defaults to '<input>.json' when omitted).
        - Terminates via argparse if -i has the wrong extension.
    """
    parser = argparse.ArgumentParser(description="Compiling Json from cgc_standard.out")
    parser.add_argument(
        "-i",
        required=True,
        help="path to output file (cgc_standard.out) file",
        type=lambda s: file_ext(("out", "csv"), s, parser),
    )
    parser.add_argument(
        "-o",
        "--output",
        help="path for the JSON output; defaults to '<input>.json'",
    )
    args = parser.parse_args()

    # read_csv accepts a path directly; no need for an explicit open().
    data = pd.read_csv(args.i, sep="\t")
    data.rename(
        columns={
            "CGC#": "cgc_id",
            "Gene Type": "gene_type",
            "Contig ID": "contig_id",
            "Protein ID": "protein_id",
            "Gene Start": "gene_start",
            "Gene Stop": "gene_stop",
            "Direction": "direction",
            "Protein Family": "protein_family",
        },
        inplace=True,
    )
    # Assign instead of inplace fillna on a column: the inplace form is the
    # deprecated chained-assignment pattern in modern pandas.
    data["gene_type"] = data["gene_type"].fillna("Null_Type")
    data["protein_family"] = data["protein_family"].fillna("0")
    p = PrePro(data)

    pul_dict = {}
    for sub_dict in p.pul_section():
        pul_dict.update(sub_dict)
    json_puls = json.dumps(pul_dict, indent=4, cls=CustomEncoder)

    # The original crashed with TypeError when -o was omitted; fall back to
    # '<input>.json' so the optional flag actually is optional.
    out_path = args.output if args.output else args.i + ".json"
    with open(out_path, "w") as outfile:
        outfile.write(json_puls)


if __name__ == "__main__":
    main()
88 changes: 88 additions & 0 deletions hmmer_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/env python3
##########################################################
# hmmsearch parser for dbCAN meta server
#
# Written by Tanner Yohe under the supervision
# of Dr. Yin in the YinLab at NIU.
# Updated by Haidong Yi to reformat the codes.
# Updated by Le Huang from tips the contributor WATSON Mick <[email protected]>,
# Thank you!
#
# Modified by Alex Fraser to have a run() method that can be called and returns data for better integration with other
# scripts. This script also retains the ability to be called from shell and output to pipe redirection.
#
# INPUT
# python hmmer_parser.py [inputFile] [eval] [coverage]
# eval and coverage are optional, inputFile is required
# -updating info:
# -adds pid for every subprocess to make codes robust.
# Last updated: 1/10/19
###########################################################

import os
import sys
from subprocess import call


def run(input_file, eval_num=1e-15, coverage=0.35, verbose=False):
    """
    Processes a given input file and returns filtered genomic data.

    Reads an hmmsearch output file, filters and de-overlaps hits via a shell
    pipeline (grep/awk/sed/sort/perl), then keeps rows passing the e-value
    and coverage thresholds.
    Parameters
    ----------
    input_file (str): The path to the input file containing genomic data.
    eval_num (float, optional): E-value threshold for filtering. Defaults to 1e-15.
    coverage (float, optional): Coverage threshold for filtering. Defaults to 0.35.
    verbose (bool, optional): If True, prints each kept row. Defaults to False.
    Returns
    -------
    str: The kept rows, one per line, tab-separated, with the computed
    coverage appended as the last column.
    Side Effects:
        - Creates and deletes a temporary file named 'temp.<process_id>'.
        - Prints kept rows to stdout if verbose is True.
    Note:
        Relies on external shell commands and a Perl one-liner, and assumes
        the standard hmmsearch --domtblout column layout.
    """
    tmpfile = "temp." + str(os.getpid())

    try:
        call(
            "cat "
            + input_file
            + " | grep -v '^#' | awk '{print $4,$6,$1,$3,$13,$16,$17,$18,$19}' | sed 's/ /\t/g' | sort -k 3,3 -k 8n -k 9n | perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[2]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[4]<$c[4]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_.\"\n\";}}' > "
            + tmpfile,
            shell=True,
        )

        output = ""
        with open(tmpfile) as f:
            for line in f:
                row = line.rstrip().split("\t")
                # Coverage = aligned HMM span / HMM length.
                row.append(float(int(row[6]) - int(row[5])) / int(row[1]))
                if float(row[4]) <= float(eval_num) and float(row[-1]) >= float(coverage):
                    if verbose:
                        print("\t".join([str(x) for x in row]))
                    output += "\t".join([str(x) for x in row]) + "\n"
    finally:
        # os.remove instead of spawning a shell ('rm'): portable, no quoting
        # issues, and runs even if the parsing above raises.
        if os.path.exists(tmpfile):
            os.remove(tmpfile)

    return output


if __name__ == "__main__":
    # Bug fix: the second branch was a plain 'if', so giving all three
    # arguments ran the pipeline TWICE (once with user thresholds, once
    # with defaults). 'elif' makes the branches mutually exclusive.
    if len(sys.argv) > 3:
        input_path = sys.argv[1]
        eval_arg = float(sys.argv[2])
        coverage_arg = float(sys.argv[3])
        run(input_path, eval_arg, coverage_arg, verbose=True)
    elif len(sys.argv) > 1:
        run(sys.argv[1], verbose=True)
    else:
        print("Please give a hmmscan output file as the first command")
        exit()
Loading

0 comments on commit 3c43a75

Please sign in to comment.