diff --git a/.gitignore b/.gitignore index 7edb243..a3716d5 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,7 @@ # Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) ./tests +setup.sh +./build +./docs +./dist \ No newline at end of file diff --git a/build/lib/searcHPV/alignment.py b/build/lib/searcHPV/alignment.py index 02c8c06..c744166 100644 --- a/build/lib/searcHPV/alignment.py +++ b/build/lib/searcHPV/alignment.py @@ -13,7 +13,8 @@ #outputDir: output directory #multi: if fastq file is in gz format: default = True #index: if True, index the reference files; if False, not index the references files -def alignment(fq1, fq2, humRef, virRef, outputDir, index, gz): +#memory: memory size allocated +def alignment(fq1, fq2, humRef, virRef, outputDir, index, gz, memory,thread): #make output dir outputDir = os.path.abspath(outputDir) mkdir(outputDir) @@ -26,8 +27,8 @@ def alignment(fq1, fq2, humRef, virRef, outputDir, index, gz): alignmentFile = scriptDir + "/orignal.alignment.sh" indelFile = scriptDir + "/indel.alignment.sh" - generate_alignment_bash(alignmentFile,ref,fq1,fq2,scriptDir,gz) - generate_indel_alignment_bash(indelFile,ref,scriptDir) + generate_alignment_bash(alignmentFile,ref,fq1,fq2,scriptDir,memory,thread,gz) + generate_indel_alignment_bash(indelFile,ref,scriptDir, memory,thread) check_file(alignmentFile) check_file(indelFile) bashFile = scriptDir + f"/alignment.sh" @@ -51,7 +52,7 @@ def alignment(fq1, fq2, humRef, virRef, outputDir, index, gz): if index: ##generate mkdup alignment bash file mkdupFile = scriptDir + "/mkdup.alignment.sh" - generate_mkdup_bash(mkdupFile,scriptDir) + generate_mkdup_bash(mkdupFile,scriptDir,thread) with open(bashFile,'w') as output: output.write(f'''#!/bin/bash @@ -65,7 +66,7 @@ def alignment(fq1, fq2, humRef, virRef, outputDir, index, gz): else: ##generate mkdup alignment bash file mkdupFile = scriptDir + "/mkdup.alignment.sh" - 
generate_mkdup_bash(mkdupFile,scriptDir) + generate_mkdup_bash(mkdupFile,scriptDir,thread) with open(bashFile,'w') as output: output.write(f'''#!/bin/bash diff --git a/build/lib/searcHPV/assemble.py b/build/lib/searcHPV/assemble.py index 7564fb0..872d96a 100644 --- a/build/lib/searcHPV/assemble.py +++ b/build/lib/searcHPV/assemble.py @@ -9,7 +9,7 @@ #out_dir: output directory for searcHPV #virRef: virus reference genome #window: the length of region searching for informative reads, default=300 -def assemble(fq1, fq2, out_dir,virRef,gz,window): +def assemble(fq1, fq2, out_dir,virRef,gz,window,memory,thread): bam = f'{out_dir}/alignment/alignment.RG.indelre.mkdup.sort.bam' check_file(bam) assemble_out_dir = f'{out_dir}/assemble/' @@ -20,7 +20,7 @@ def assemble(fq1, fq2, out_dir,virRef,gz,window): subprocess.call(script_read_seq) #preprocessForPear(assemble_out_dir) - script_pear = pear(assemble_out_dir) + script_pear = pear(assemble_out_dir,memory,thread) os.system(f'chmod +x {script_pear}') subprocess.call(script_pear) diff --git a/build/lib/searcHPV/generate_alignment.py b/build/lib/searcHPV/generate_alignment.py index de32c69..a140426 100644 --- a/build/lib/searcHPV/generate_alignment.py +++ b/build/lib/searcHPV/generate_alignment.py @@ -42,21 +42,21 @@ def indexRef(bash_file,ref): #fq2: fastq2 file #out_dir: outputPath #gz: if fastq file is in gz format: default = True -def generate_alignment_bash(bash_file,ref,fq1,fq2,out_dir,gz = True): +def generate_alignment_bash(bash_file,ref,fq1,fq2,out_dir,memory,thread,gz = True): with open(bash_file,'w') as output: if gz: output.write(f''' - bwa mem {ref} ' {out_dir}/alignment.sam - samtools view -bhS {out_dir}/alignment.sam > {out_dir}/alignment.bam - samtools sort {out_dir}/alignment.bam -o {out_dir}/alignment.sort.bam + bwa mem -t {thread} {ref} ' {out_dir}/alignment.sam + samtools view -@ {thread} -bhS {out_dir}/alignment.sam > {out_dir}/alignment.bam + samtools sort -@ {thread} {out_dir}/alignment.bam -o 
{out_dir}/alignment.sort.bam rm {out_dir}/alignment.sam echo \'alignment done\' ''') else: output.write(f''' - bwa mem {ref} ' {out_dir}/alignment.sam - samtools view -bhS {out_dir}/alignment.sam > {out_dir}/alignment.bam - samtools sort {out_dir}/alignment.bam -o {out_dir}/alignment.sort.bam + bwa mem -t {thread} {ref} ' {out_dir}/alignment.sam + samtools view -@ {thread} -bhS {out_dir}/alignment.sam > {out_dir}/alignment.bam + samtools sort -@ {thread} {out_dir}/alignment.bam -o {out_dir}/alignment.sort.bam rm {out_dir}/alignment.sam echo \'alignment done\' ''') @@ -78,7 +78,7 @@ def generate_alignment_bash(bash_file,ref,fq1,fq2,out_dir,gz = True): #picard: full path of picard #gatk: full path of GATK #java: full path of java -def generate_indel_alignment_bash(bash_File,ref,out_dir): +def generate_indel_alignment_bash(bash_File,ref,out_dir,memory,thread): with open(bash_File,'w') as output: output.write(f'''picard \ AddOrReplaceReadGroups \ @@ -89,35 +89,37 @@ def generate_indel_alignment_bash(bash_File,ref,out_dir): RGPU=NA \ RGSM=sample \ RGLB=sample -samtools sort {out_dir}/alignment.RG.bam -o {out_dir}/alignment.RG.sort.bam -samtools index {out_dir}/alignment.RG.sort.bam +samtools sort -@ {thread} {out_dir}/alignment.RG.bam -o {out_dir}/alignment.RG.sort.bam +samtools index -@ {thread} {out_dir}/alignment.RG.sort.bam GenomeAnalysisTK \ +-Xmx{memory} \ -T RealignerTargetCreator \ -R {ref} \ -I {out_dir}/alignment.RG.sort.bam \ -o {out_dir}/alignment.RG.intervals GenomeAnalysisTK \ +-Xmx{memory} \ -T IndelRealigner \ -R {ref} \ -I {out_dir}/alignment.RG.sort.bam \ -targetIntervals {out_dir}/alignment.RG.intervals \ -o {out_dir}/alignment.RG.indelre.bam -samtools index {out_dir}/alignment.RG.indelre.bam +samtools index -@ {thread} {out_dir}/alignment.RG.indelre.bam echo \'indel alignment done\'''') -def generate_mkdup_bash(bash_File,out_dir): +def generate_mkdup_bash(bash_File,out_dir,thread): with open(bash_File,'w') as output: output.write(f''' -samtools sort 
-n {out_dir}/alignment.RG.indelre.bam -o {out_dir}/alignment.RG.indelre.sortbyQ.bam +samtools sort -@ {thread} -n {out_dir}/alignment.RG.indelre.bam -o {out_dir}/alignment.RG.indelre.sortbyQ.bam picard MarkDuplicates \ I={out_dir}/alignment.RG.indelre.sortbyQ.bam \ O={out_dir}/alignment.RG.indelre.mkdup.bam \ M={out_dir}/alignment.RG.indelre.mkdup.txt \ TAGGING_POLICY=All ASSUME_SORT_ORDER=queryname -samtools sort {out_dir}/alignment.RG.indelre.mkdup.bam -o {out_dir}/alignment.RG.indelre.mkdup.sort.bam -samtools index {out_dir}/alignment.RG.indelre.mkdup.sort.bam +samtools sort -@ {thread} {out_dir}/alignment.RG.indelre.mkdup.bam -o {out_dir}/alignment.RG.indelre.mkdup.sort.bam +samtools index -@ {thread} {out_dir}/alignment.RG.indelre.mkdup.sort.bam echo \'indel alignment done\'''') diff --git a/build/lib/searcHPV/generate_assemble.py b/build/lib/searcHPV/generate_assemble.py index 302d5a0..60c0da0 100644 --- a/build/lib/searcHPV/generate_assemble.py +++ b/build/lib/searcHPV/generate_assemble.py @@ -3,6 +3,7 @@ import re import pysam from searcHPV.general import * +import string ######################## #get informative reads (SP + PE) from bam file @@ -67,7 +68,9 @@ def extract_read_name(bam,out_dir,virRef,window): if '>' in each: virus_chrm_list.append(each.split()[0].replace('>','')) for virus_chrm in virus_chrm_list: - fusionRes = f'{out_dir}/call_fusion/{virus_chrm}.all.filtered.clustered.result' + invalidCharacter = re.escape(string.punctuation) + virus_chrm_file_name = re.sub(r'['+invalidCharacter+']',"_",virus_chrm) + fusionRes = f'{out_dir}/call_fusion/{virus_chrm_file_name}.all.filtered.clustered.result' check_file(fusionRes) with open(fusionRes) as candidate_in: candidate_in = candidate_in.read().rstrip() @@ -172,7 +175,7 @@ def preprocessForPear(out_dir): #Function:generate bash script for PEAR #out_dir:output directory for assemble #return:path of bash script -def pear(out_dir): +def pear(out_dir,memory,thread): listSites = os.listdir(out_dir) with 
open(f'{out_dir}/pear.sh','w') as output: output.write('#!/bin/bash\n') @@ -184,6 +187,8 @@ def pear(out_dir): os.mkdir(outputPath) output.write(f''' pear \ + -j {thread} \ + -y {memory} \ -f {fqPath}/{site}.informativeReads.1.fq \ -r {fqPath}/{site}.informativeReads.2.fq \ -o {outputPath}/{site}''') diff --git a/build/lib/searcHPV/generate_call_fusion.py b/build/lib/searcHPV/generate_call_fusion.py index 464c8ff..d9fae38 100644 --- a/build/lib/searcHPV/generate_call_fusion.py +++ b/build/lib/searcHPV/generate_call_fusion.py @@ -2,6 +2,8 @@ import sys from searcHPV.general import * import re +import string +import re ################## #find the end of each cigar string slot @@ -153,12 +155,13 @@ def define_fusion(bam,virus_chrm,out_dir): if read.next_reference_name == virus_chrm: paired_evidence_count += 1 b[i] += (str(paired_evidence_count),) - - with open(f'{out_dir}/{virus_chrm}.genome_fusion.txt','w') as output: + invalidCharacter = re.escape(string.punctuation) + virus_chrm_file_name = re.sub(r'['+invalidCharacter+']',"_",virus_chrm) + with open(f'{out_dir}/{virus_chrm_file_name}.genome_fusion.txt','w') as output: output.write('chrm\tpos\tsingle_evidence\tpaired_evidence\n') for i in b: output.write(f'{i[0]}\t{i[1]}\t{i[2]}\t{i[3]}\n') - return f'{out_dir}/{virus_chrm}.genome_fusion.txt' + return f'{out_dir}/{virus_chrm_file_name}.genome_fusion.txt' ################## #cluster fusion points within certain base pair diff --git a/build/lib/searcHPV/generate_call_fusion_virus.py b/build/lib/searcHPV/generate_call_fusion_virus.py index bdc860f..715ae1b 100644 --- a/build/lib/searcHPV/generate_call_fusion_virus.py +++ b/build/lib/searcHPV/generate_call_fusion_virus.py @@ -8,7 +8,7 @@ #out_dir:out_dir for searcHPV #humRef: human reference genome #return: path of bash script -def mapToRef(out_dir): +def mapToRef(out_dir,thread): newRef = f'{out_dir}/hg_hpv.fa' sitesPath = f'{out_dir}/assemble/' listSites = os.listdir(sitesPath) @@ -23,11 +23,11 @@ def 
mapToRef(out_dir): contigPath = f'{sitesPath}/{site}/pearOutput/' sitePath = f'{out_dir}/call_fusion_virus/{site}/' mkdir(sitePath) - bashFile.write( f'''bwa mem -R \'@RG\\tID:hpv\\tSM:hpv\\tLB:hpv\\tPL:ILLUMINA\' -M -t 8 \ + bashFile.write( f'''bwa mem -t {thread} -R \'@RG\\tID:hpv\\tSM:hpv\\tLB:hpv\\tPL:ILLUMINA\' -M -t 8 \ {newRef} {contigPath}/{site}.all.fa.cap.contigs > {sitePath}/{site}.contig.sam; -samtools view -bhS {sitePath}/{site}.contig.sam > {sitePath}/{site}.contig.bam; -samtools sort {sitePath}/{site}.contig.bam -o {sitePath}/{site}.contig.sort.bam; -samtools index {sitePath}/{site}.contig.sort.bam; +samtools view -@ {thread} -bhS {sitePath}/{site}.contig.sam > {sitePath}/{site}.contig.bam; +samtools sort -@ {thread} {sitePath}/{site}.contig.bam -o {sitePath}/{site}.contig.sort.bam; +samtools index -@ {thread} {sitePath}/{site}.contig.sort.bam; rm {sitePath}/{site}.contig.bam;\n''') return f'{outputPath}/alignContigsToRef.sh' @@ -39,7 +39,7 @@ def mapToRef(out_dir): #out_dir:out_dir for searcHPV #humRef: human reference genome #return: path of bash script -def mapToHgRef(out_dir,humRef): +def mapToHgRef(out_dir,humRef,thread): sitesPath = f'{out_dir}/assemble/' listSites = os.listdir(sitesPath) outputPath = f'{out_dir}/call_fusion_virus/' @@ -53,11 +53,11 @@ def mapToHgRef(out_dir,humRef): contigPath = f'{sitesPath}/{site}/pearOutput/' sitePath = f'{out_dir}/call_fusion_virus/{site}/' mkdir(sitePath) - bashFile.write( f'''bwa mem -R \'@RG\\tID:hpv\\tSM:hpv\\tLB:hpv\\tPL:ILLUMINA\' -M -t 8 \ + bashFile.write( f'''bwa mem -t {thread} -R \'@RG\\tID:hpv\\tSM:hpv\\tLB:hpv\\tPL:ILLUMINA\' -M -t 8 \ {humRef} {contigPath}/{site}.all.fa.cap.contigs > {sitePath}/{site}.contigToGenome.sam; -samtools view -bhS {sitePath}/{site}.contigToGenome.sam > {sitePath}/{site}.contigToGenome.bam; -samtools sort {sitePath}/{site}.contigToGenome.bam -o {sitePath}/{site}.contigToGenome.sort.bam; -samtools index {sitePath}/{site}.contigToGenome.sort.bam; +samtools view -@ 
{thread} -bhS {sitePath}/{site}.contigToGenome.sam > {sitePath}/{site}.contigToGenome.bam; +samtools sort -@ {thread} {sitePath}/{site}.contigToGenome.bam -o {sitePath}/{site}.contigToGenome.sort.bam; +samtools index -@ {thread} {sitePath}/{site}.contigToGenome.sort.bam; rm {sitePath}/{site}.contigToGenome.bam;\n''') return f'{outputPath}/alignContigsToGenome.sh' @@ -66,7 +66,7 @@ def mapToHgRef(out_dir,humRef): #out_dir:out_dir for searcHPV #virRef: virus reference genome #return: path of bash script -def mapToVirRef(out_dir,virRef): +def mapToVirRef(out_dir,virRef,thread): sitesPath = f'{out_dir}/assemble/' listSites = os.listdir(sitesPath) outputPath = f'{out_dir}/call_fusion_virus/' @@ -79,12 +79,12 @@ def mapToVirRef(out_dir,virRef): if ".sh" not in site: contigPath = f'{sitesPath}/{site}/pearOutput/' sitePath = f'{out_dir}/call_fusion_virus/{site}/' - bashFile.write( f'''bwa mem -R \'@RG\\tID:hpv\\tSM:hpv\\tLB:hpv\\tPL:ILLUMINA\' -M -t 8 \ + bashFile.write( f'''bwa mem -t {thread} -R \'@RG\\tID:hpv\\tSM:hpv\\tLB:hpv\\tPL:ILLUMINA\' -M -t 8 \ {virRef} {contigPath}/{site}.all.fa.cap.contigs > {contigPath}/{site}.contigToVirus.sam; -samtools view -bhS {contigPath}/{site}.contigToVirus.sam > {contigPath}/{site}.contigToVirus.bam; -samtools sort {contigPath}/{site}.contigToVirus.bam -o {sitePath}/{site}.contigToVirus.sort.bam; -samtools index {sitePath}/{site}.contigToVirus.sort.bam; -samtools faidx {contigPath}/{site}.all.fa.cap.contigs; +samtools view -@ {thread} -bhS {contigPath}/{site}.contigToVirus.sam > {contigPath}/{site}.contigToVirus.bam; +samtools sort -@ {thread} {contigPath}/{site}.contigToVirus.bam -o {sitePath}/{site}.contigToVirus.sort.bam; +samtools index -@ {thread} {sitePath}/{site}.contigToVirus.sort.bam; +samtools faidx -@ {thread} {contigPath}/{site}.all.fa.cap.contigs; rm {contigPath}/{site}.contigToVirus.bam;\n''') return f'{outputPath}/alignContigsToVirus.sh' diff --git a/build/lib/searcHPV/genome_fusion.py 
b/build/lib/searcHPV/genome_fusion.py index be8685f..36e419e 100644 --- a/build/lib/searcHPV/genome_fusion.py +++ b/build/lib/searcHPV/genome_fusion.py @@ -2,6 +2,7 @@ import sys from searcHPV.general import * from searcHPV.generate_call_fusion import * +import string def genomeFusion(window,out_dir,virRef): @@ -24,7 +25,10 @@ def genomeFusion(window,out_dir,virRef): #filter and cluster fusion points ##sort result - os.system(f'(head -n 1 {res} && tail -n +2 {res} | sort -k3,3rn) > {out_dir}/{virus_chrm}.genome_fusion.sort.txt') + #if there are invalid characters in the chromosome name: + invalidCharacter = re.escape(string.punctuation) + virus_chrm_file_name = re.sub(r'['+invalidCharacter+']',"_",virus_chrm) + os.system(f'(head -n 1 {res} && tail -n +2 {res} | sort -k3,3rn) > {out_dir}/{virus_chrm_file_name}.genome_fusion.sort.txt') #change format for cluster @@ -32,9 +36,9 @@ def genomeFusion(window,out_dir,virRef): chrm_li = list(map(str,chrm_li)) chrm_li+=['X','Y'] - with open(f'{out_dir}/{virus_chrm}.all.result','w') as output: + with open(f'{out_dir}/{virus_chrm_file_name}.all.result','w') as output: fusion_li = [] - with open(f'{out_dir}/{virus_chrm}.genome_fusion.sort.txt') as res: + with open(f'{out_dir}/{virus_chrm_file_name}.genome_fusion.sort.txt') as res: res.readline() for line in res.read().rstrip().split('\n'): elements = line.rstrip().split('\t') @@ -51,12 +55,12 @@ def genomeFusion(window,out_dir,virRef): output.write(f'{to_print}') ##cluster the events within 100bp from each other, maybe becasue of SVs or CNVs - cluster_result(f'{out_dir}/{virus_chrm}.all.result',f'{out_dir}/{virus_chrm}.all.clustered.result',window) + cluster_result(f'{out_dir}/{virus_chrm_file_name}.all.result',f'{out_dir}/{virus_chrm_file_name}.all.clustered.result',window) ##filter for sites with at least 2 split read and 2 pairs of read support(high cutoff) and their summation greater than 5 - with open(f'{out_dir}/{virus_chrm}.all.clustered.result') as inf: - with 
open(f'{out_dir}/{virus_chrm}.all.filtered.clustered.result','w') as outf: + with open(f'{out_dir}/{virus_chrm_file_name}.all.clustered.result') as inf: + with open(f'{out_dir}/{virus_chrm_file_name}.all.filtered.clustered.result','w') as outf: for line in inf.read().rstrip().split('\n'): elements = line.rstrip() if elements == "": diff --git a/build/lib/searcHPV/main.py b/build/lib/searcHPV/main.py index 24b6c82..fd41f4d 100644 --- a/build/lib/searcHPV/main.py +++ b/build/lib/searcHPV/main.py @@ -61,6 +61,13 @@ def main(): default = False, dest ='index', help ="index the original human and virus reference files, default=False") + parser.add_argument('-memory', type = str, default="10G",dest = 'memory', + help ='specify the maximum size, default "10G"', + ) + parser.add_argument('-thread', type = int, default=1,dest = 'thread', + help ='number of threads, default 1', + ) + @@ -72,19 +79,19 @@ def main(): if args.alignment: - alignment(fq1 = args.fq1, fq2 = args.fq2, humRef = args.humRef, virRef = args.virRef, index = args.index, outputDir = args.outputDir, gz = args.gz) + alignment(fq1 = args.fq1, fq2 = args.fq2, humRef = args.humRef, virRef = args.virRef, index = args.index, outputDir = args.outputDir, gz = args.gz, memory = args.memory,thread = args.thread) elif args.genomeFusion: genomeFusion(args.clusterWindow,args.outputDir,args.virRef) elif args.assemble: #check result from genomeFusion - assemble(args.fq1, args.fq2, args.outputDir,args.virRef,args.gz,args.window) + assemble(args.fq1, args.fq2, args.outputDir,args.virRef,args.gz,args.window, memory = args.memory,thread = args.thread) elif args.hpvFusion: #check result from assemble - virus_fusion(args.humRef,args.virRef,args.outputDir,args.n) + virus_fusion(args.humRef,args.virRef,args.outputDir,args.n,thread = args.thread) else: - alignment(fq1 = args.fq1, fq2 = args.fq2, humRef = args.humRef, virRef = args.virRef, index = args.index, outputDir = args.outputDir, gz = args.gz) + alignment(fq1 = args.fq1, fq2 
= args.fq2, humRef = args.humRef, virRef = args.virRef, index = args.index, outputDir = args.outputDir, gz = args.gz, memory = args.memory,thread = args.thread) genomeFusion(args.clusterWindow,args.outputDir,args.virRef) - assemble(args.fq1, args.fq2, args.outputDir,args.virRef,args.gz,args.window) - virus_fusion(args.humRef,args.virRef,args.outputDir,args.n) + assemble(args.fq1, args.fq2, args.outputDir,args.virRef,args.gz,args.window, memory = args.memory,thread = args.thread) + virus_fusion(args.humRef,args.virRef,args.outputDir,args.n,thread = args.thread) \ No newline at end of file diff --git a/build/lib/searcHPV/virus_fusion.py b/build/lib/searcHPV/virus_fusion.py index 42a1562..89f3acf 100644 --- a/build/lib/searcHPV/virus_fusion.py +++ b/build/lib/searcHPV/virus_fusion.py @@ -9,10 +9,10 @@ #virRef: virus reference genome #out_dir: output directory for seacHPV #n: poly(n), n*d(A/T/C/G), will report low confidence if contig contains poly(n) -def virus_fusion(humRef,virRef,out_dir,n): - script_map = mapToRef(out_dir) - script_mapHg = mapToHgRef(out_dir,humRef) - script_mapVir = mapToVirRef(out_dir,virRef) +def virus_fusion(humRef,virRef,out_dir,n,thread): + script_map = mapToRef(out_dir,thread) + script_mapHg = mapToHgRef(out_dir,humRef,thread) + script_mapVir = mapToVirRef(out_dir,virRef,thread) os.system(f'chmod +x {script_map}') subprocess.call(script_map) os.system(f'chmod +x {script_mapHg}') diff --git a/dist/searcHPV-1.0.10-py3-none-any.whl b/dist/searcHPV-1.0.10-py3-none-any.whl new file mode 100644 index 0000000..42461ff Binary files /dev/null and b/dist/searcHPV-1.0.10-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.10-py3.7.egg b/dist/searcHPV-1.0.10-py3.7.egg new file mode 100644 index 0000000..9322aaa Binary files /dev/null and b/dist/searcHPV-1.0.10-py3.7.egg differ diff --git a/dist/searcHPV-1.0.10-py3.8.egg b/dist/searcHPV-1.0.10-py3.8.egg new file mode 100644 index 0000000..5dbb9e6 Binary files /dev/null and 
b/dist/searcHPV-1.0.10-py3.8.egg differ diff --git a/dist/searcHPV-1.0.10.tar.gz b/dist/searcHPV-1.0.10.tar.gz new file mode 100644 index 0000000..78fc0a6 Binary files /dev/null and b/dist/searcHPV-1.0.10.tar.gz differ diff --git a/dist/searcHPV-1.0.11-py3-none-any.whl b/dist/searcHPV-1.0.11-py3-none-any.whl new file mode 100644 index 0000000..d483e9d Binary files /dev/null and b/dist/searcHPV-1.0.11-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.11-py3.8.egg b/dist/searcHPV-1.0.11-py3.8.egg new file mode 100644 index 0000000..4f6116d Binary files /dev/null and b/dist/searcHPV-1.0.11-py3.8.egg differ diff --git a/dist/searcHPV-1.0.11.tar.gz b/dist/searcHPV-1.0.11.tar.gz new file mode 100644 index 0000000..ddf0e02 Binary files /dev/null and b/dist/searcHPV-1.0.11.tar.gz differ diff --git a/dist/searcHPV-1.0.12-py3-none-any.whl b/dist/searcHPV-1.0.12-py3-none-any.whl new file mode 100644 index 0000000..686b1fa Binary files /dev/null and b/dist/searcHPV-1.0.12-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.12-py3.8.egg b/dist/searcHPV-1.0.12-py3.8.egg new file mode 100644 index 0000000..5e9e678 Binary files /dev/null and b/dist/searcHPV-1.0.12-py3.8.egg differ diff --git a/dist/searcHPV-1.0.12.tar.gz b/dist/searcHPV-1.0.12.tar.gz new file mode 100644 index 0000000..c017580 Binary files /dev/null and b/dist/searcHPV-1.0.12.tar.gz differ diff --git a/dist/searcHPV-1.0.13-py3-none-any.whl b/dist/searcHPV-1.0.13-py3-none-any.whl new file mode 100644 index 0000000..d849978 Binary files /dev/null and b/dist/searcHPV-1.0.13-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.13-py3.8.egg b/dist/searcHPV-1.0.13-py3.8.egg new file mode 100644 index 0000000..da33549 Binary files /dev/null and b/dist/searcHPV-1.0.13-py3.8.egg differ diff --git a/dist/searcHPV-1.0.13.tar.gz b/dist/searcHPV-1.0.13.tar.gz new file mode 100644 index 0000000..6cbfffd Binary files /dev/null and b/dist/searcHPV-1.0.13.tar.gz differ diff --git a/dist/searcHPV-1.0.14-py3-none-any.whl 
b/dist/searcHPV-1.0.14-py3-none-any.whl new file mode 100644 index 0000000..d3d9978 Binary files /dev/null and b/dist/searcHPV-1.0.14-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.14-py3.8.egg b/dist/searcHPV-1.0.14-py3.8.egg new file mode 100644 index 0000000..6f42e38 Binary files /dev/null and b/dist/searcHPV-1.0.14-py3.8.egg differ diff --git a/dist/searcHPV-1.0.14.tar.gz b/dist/searcHPV-1.0.14.tar.gz new file mode 100644 index 0000000..4537cc1 Binary files /dev/null and b/dist/searcHPV-1.0.14.tar.gz differ diff --git a/dist/searcHPV-1.0.15-py3-none-any.whl b/dist/searcHPV-1.0.15-py3-none-any.whl new file mode 100644 index 0000000..f5f9aa7 Binary files /dev/null and b/dist/searcHPV-1.0.15-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.15-py3.8.egg b/dist/searcHPV-1.0.15-py3.8.egg new file mode 100644 index 0000000..887fce8 Binary files /dev/null and b/dist/searcHPV-1.0.15-py3.8.egg differ diff --git a/dist/searcHPV-1.0.15.tar.gz b/dist/searcHPV-1.0.15.tar.gz new file mode 100644 index 0000000..ad36eac Binary files /dev/null and b/dist/searcHPV-1.0.15.tar.gz differ diff --git a/dist/searcHPV-1.0.16-py3-none-any.whl b/dist/searcHPV-1.0.16-py3-none-any.whl new file mode 100644 index 0000000..4722697 Binary files /dev/null and b/dist/searcHPV-1.0.16-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.16-py3.7.egg b/dist/searcHPV-1.0.16-py3.7.egg new file mode 100644 index 0000000..1f7a381 Binary files /dev/null and b/dist/searcHPV-1.0.16-py3.7.egg differ diff --git a/dist/searcHPV-1.0.16-py3.8.egg b/dist/searcHPV-1.0.16-py3.8.egg new file mode 100644 index 0000000..f40d7fc Binary files /dev/null and b/dist/searcHPV-1.0.16-py3.8.egg differ diff --git a/dist/searcHPV-1.0.16.tar.gz b/dist/searcHPV-1.0.16.tar.gz new file mode 100644 index 0000000..52fc07f Binary files /dev/null and b/dist/searcHPV-1.0.16.tar.gz differ diff --git a/dist/searcHPV-1.0.17-py3-none-any.whl b/dist/searcHPV-1.0.17-py3-none-any.whl new file mode 100644 index 0000000..bf34e41 
Binary files /dev/null and b/dist/searcHPV-1.0.17-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.17-py3.8.egg b/dist/searcHPV-1.0.17-py3.8.egg new file mode 100644 index 0000000..a784bf0 Binary files /dev/null and b/dist/searcHPV-1.0.17-py3.8.egg differ diff --git a/dist/searcHPV-1.0.17.tar.gz b/dist/searcHPV-1.0.17.tar.gz new file mode 100644 index 0000000..9252322 Binary files /dev/null and b/dist/searcHPV-1.0.17.tar.gz differ diff --git a/dist/searcHPV-1.0.3-py3-none-any.whl b/dist/searcHPV-1.0.3-py3-none-any.whl new file mode 100644 index 0000000..2469674 Binary files /dev/null and b/dist/searcHPV-1.0.3-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.3-py3.8.egg b/dist/searcHPV-1.0.3-py3.8.egg new file mode 100644 index 0000000..9acb094 Binary files /dev/null and b/dist/searcHPV-1.0.3-py3.8.egg differ diff --git a/dist/searcHPV-1.0.3.tar.gz b/dist/searcHPV-1.0.3.tar.gz new file mode 100644 index 0000000..3a5e1d9 Binary files /dev/null and b/dist/searcHPV-1.0.3.tar.gz differ diff --git a/dist/searcHPV-1.0.4-py3-none-any.whl b/dist/searcHPV-1.0.4-py3-none-any.whl new file mode 100644 index 0000000..1a443be Binary files /dev/null and b/dist/searcHPV-1.0.4-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.4-py3.8.egg b/dist/searcHPV-1.0.4-py3.8.egg new file mode 100644 index 0000000..614f31c Binary files /dev/null and b/dist/searcHPV-1.0.4-py3.8.egg differ diff --git a/dist/searcHPV-1.0.4.tar.gz b/dist/searcHPV-1.0.4.tar.gz new file mode 100644 index 0000000..f6a2dab Binary files /dev/null and b/dist/searcHPV-1.0.4.tar.gz differ diff --git a/dist/searcHPV-1.0.5-py3-none-any.whl b/dist/searcHPV-1.0.5-py3-none-any.whl new file mode 100644 index 0000000..daa3e35 Binary files /dev/null and b/dist/searcHPV-1.0.5-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.5-py3.8.egg b/dist/searcHPV-1.0.5-py3.8.egg new file mode 100644 index 0000000..5ca4fbd Binary files /dev/null and b/dist/searcHPV-1.0.5-py3.8.egg differ diff --git 
a/dist/searcHPV-1.0.5.tar.gz b/dist/searcHPV-1.0.5.tar.gz new file mode 100644 index 0000000..d7bed92 Binary files /dev/null and b/dist/searcHPV-1.0.5.tar.gz differ diff --git a/dist/searcHPV-1.0.6-py3-none-any.whl b/dist/searcHPV-1.0.6-py3-none-any.whl new file mode 100644 index 0000000..a3caae7 Binary files /dev/null and b/dist/searcHPV-1.0.6-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.6-py3.8.egg b/dist/searcHPV-1.0.6-py3.8.egg new file mode 100644 index 0000000..8d5daf7 Binary files /dev/null and b/dist/searcHPV-1.0.6-py3.8.egg differ diff --git a/dist/searcHPV-1.0.6.tar.gz b/dist/searcHPV-1.0.6.tar.gz new file mode 100644 index 0000000..b6b1044 Binary files /dev/null and b/dist/searcHPV-1.0.6.tar.gz differ diff --git a/dist/searcHPV-1.0.7-py3-none-any.whl b/dist/searcHPV-1.0.7-py3-none-any.whl new file mode 100644 index 0000000..09bdb9a Binary files /dev/null and b/dist/searcHPV-1.0.7-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.7-py3.8.egg b/dist/searcHPV-1.0.7-py3.8.egg new file mode 100644 index 0000000..b43fba3 Binary files /dev/null and b/dist/searcHPV-1.0.7-py3.8.egg differ diff --git a/dist/searcHPV-1.0.7.tar.gz b/dist/searcHPV-1.0.7.tar.gz new file mode 100644 index 0000000..d9bb72c Binary files /dev/null and b/dist/searcHPV-1.0.7.tar.gz differ diff --git a/dist/searcHPV-1.0.8-py3-none-any.whl b/dist/searcHPV-1.0.8-py3-none-any.whl new file mode 100644 index 0000000..00b12d4 Binary files /dev/null and b/dist/searcHPV-1.0.8-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.8-py3.8.egg b/dist/searcHPV-1.0.8-py3.8.egg new file mode 100644 index 0000000..c320dec Binary files /dev/null and b/dist/searcHPV-1.0.8-py3.8.egg differ diff --git a/dist/searcHPV-1.0.8.tar.gz b/dist/searcHPV-1.0.8.tar.gz new file mode 100644 index 0000000..4688268 Binary files /dev/null and b/dist/searcHPV-1.0.8.tar.gz differ diff --git a/dist/searcHPV-1.0.9-py3-none-any.whl b/dist/searcHPV-1.0.9-py3-none-any.whl new file mode 100644 index 0000000..cc8a5a5 
Binary files /dev/null and b/dist/searcHPV-1.0.9-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.9-py3.8.egg b/dist/searcHPV-1.0.9-py3.8.egg new file mode 100644 index 0000000..586905d Binary files /dev/null and b/dist/searcHPV-1.0.9-py3.8.egg differ diff --git a/dist/searcHPV-1.0.9.1-py3-none-any.whl b/dist/searcHPV-1.0.9.1-py3-none-any.whl new file mode 100644 index 0000000..408d890 Binary files /dev/null and b/dist/searcHPV-1.0.9.1-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.9.1-py3.8.egg b/dist/searcHPV-1.0.9.1-py3.8.egg new file mode 100644 index 0000000..9f9ecb2 Binary files /dev/null and b/dist/searcHPV-1.0.9.1-py3.8.egg differ diff --git a/dist/searcHPV-1.0.9.1.tar.gz b/dist/searcHPV-1.0.9.1.tar.gz new file mode 100644 index 0000000..5ffef20 Binary files /dev/null and b/dist/searcHPV-1.0.9.1.tar.gz differ diff --git a/dist/searcHPV-1.0.9.10-py3-none-any.whl b/dist/searcHPV-1.0.9.10-py3-none-any.whl new file mode 100644 index 0000000..f8ec3cb Binary files /dev/null and b/dist/searcHPV-1.0.9.10-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.9.10-py3.8.egg b/dist/searcHPV-1.0.9.10-py3.8.egg new file mode 100644 index 0000000..e0d1f64 Binary files /dev/null and b/dist/searcHPV-1.0.9.10-py3.8.egg differ diff --git a/dist/searcHPV-1.0.9.10.tar.gz b/dist/searcHPV-1.0.9.10.tar.gz new file mode 100644 index 0000000..04d01b8 Binary files /dev/null and b/dist/searcHPV-1.0.9.10.tar.gz differ diff --git a/dist/searcHPV-1.0.9.11-py3-none-any.whl b/dist/searcHPV-1.0.9.11-py3-none-any.whl new file mode 100644 index 0000000..b4abc23 Binary files /dev/null and b/dist/searcHPV-1.0.9.11-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.9.11-py3.8.egg b/dist/searcHPV-1.0.9.11-py3.8.egg new file mode 100644 index 0000000..d4b5555 Binary files /dev/null and b/dist/searcHPV-1.0.9.11-py3.8.egg differ diff --git a/dist/searcHPV-1.0.9.11.tar.gz b/dist/searcHPV-1.0.9.11.tar.gz new file mode 100644 index 0000000..fae58ba Binary files /dev/null and 
b/dist/searcHPV-1.0.9.11.tar.gz differ diff --git a/dist/searcHPV-1.0.9.2-py3-none-any.whl b/dist/searcHPV-1.0.9.2-py3-none-any.whl new file mode 100644 index 0000000..c3e730b Binary files /dev/null and b/dist/searcHPV-1.0.9.2-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.9.2-py3.8.egg b/dist/searcHPV-1.0.9.2-py3.8.egg new file mode 100644 index 0000000..d33868e Binary files /dev/null and b/dist/searcHPV-1.0.9.2-py3.8.egg differ diff --git a/dist/searcHPV-1.0.9.2.tar.gz b/dist/searcHPV-1.0.9.2.tar.gz new file mode 100644 index 0000000..8cf2bd5 Binary files /dev/null and b/dist/searcHPV-1.0.9.2.tar.gz differ diff --git a/dist/searcHPV-1.0.9.3-py3-none-any.whl b/dist/searcHPV-1.0.9.3-py3-none-any.whl new file mode 100644 index 0000000..0e921b4 Binary files /dev/null and b/dist/searcHPV-1.0.9.3-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.9.3-py3.8.egg b/dist/searcHPV-1.0.9.3-py3.8.egg new file mode 100644 index 0000000..2606a5c Binary files /dev/null and b/dist/searcHPV-1.0.9.3-py3.8.egg differ diff --git a/dist/searcHPV-1.0.9.3.tar.gz b/dist/searcHPV-1.0.9.3.tar.gz new file mode 100644 index 0000000..6ed18f8 Binary files /dev/null and b/dist/searcHPV-1.0.9.3.tar.gz differ diff --git a/dist/searcHPV-1.0.9.4-py3-none-any.whl b/dist/searcHPV-1.0.9.4-py3-none-any.whl new file mode 100644 index 0000000..9734db6 Binary files /dev/null and b/dist/searcHPV-1.0.9.4-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.9.4-py3.8.egg b/dist/searcHPV-1.0.9.4-py3.8.egg new file mode 100644 index 0000000..ed2edc8 Binary files /dev/null and b/dist/searcHPV-1.0.9.4-py3.8.egg differ diff --git a/dist/searcHPV-1.0.9.4.tar.gz b/dist/searcHPV-1.0.9.4.tar.gz new file mode 100644 index 0000000..5c7e028 Binary files /dev/null and b/dist/searcHPV-1.0.9.4.tar.gz differ diff --git a/dist/searcHPV-1.0.9.5-py3-none-any.whl b/dist/searcHPV-1.0.9.5-py3-none-any.whl new file mode 100644 index 0000000..a4c6e6c Binary files /dev/null and b/dist/searcHPV-1.0.9.5-py3-none-any.whl 
differ diff --git a/dist/searcHPV-1.0.9.5-py3.8.egg b/dist/searcHPV-1.0.9.5-py3.8.egg new file mode 100644 index 0000000..58de073 Binary files /dev/null and b/dist/searcHPV-1.0.9.5-py3.8.egg differ diff --git a/dist/searcHPV-1.0.9.5.tar.gz b/dist/searcHPV-1.0.9.5.tar.gz new file mode 100644 index 0000000..b9cc673 Binary files /dev/null and b/dist/searcHPV-1.0.9.5.tar.gz differ diff --git a/dist/searcHPV-1.0.9.6-py3-none-any.whl b/dist/searcHPV-1.0.9.6-py3-none-any.whl new file mode 100644 index 0000000..7a92d9d Binary files /dev/null and b/dist/searcHPV-1.0.9.6-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.9.6-py3.8.egg b/dist/searcHPV-1.0.9.6-py3.8.egg new file mode 100644 index 0000000..316c8dd Binary files /dev/null and b/dist/searcHPV-1.0.9.6-py3.8.egg differ diff --git a/dist/searcHPV-1.0.9.6.tar.gz b/dist/searcHPV-1.0.9.6.tar.gz new file mode 100644 index 0000000..b0a9acf Binary files /dev/null and b/dist/searcHPV-1.0.9.6.tar.gz differ diff --git a/dist/searcHPV-1.0.9.7-py3-none-any.whl b/dist/searcHPV-1.0.9.7-py3-none-any.whl new file mode 100644 index 0000000..3555410 Binary files /dev/null and b/dist/searcHPV-1.0.9.7-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.9.7-py3.8.egg b/dist/searcHPV-1.0.9.7-py3.8.egg new file mode 100644 index 0000000..0b99c95 Binary files /dev/null and b/dist/searcHPV-1.0.9.7-py3.8.egg differ diff --git a/dist/searcHPV-1.0.9.7.tar.gz b/dist/searcHPV-1.0.9.7.tar.gz new file mode 100644 index 0000000..129a354 Binary files /dev/null and b/dist/searcHPV-1.0.9.7.tar.gz differ diff --git a/dist/searcHPV-1.0.9.8-py3-none-any.whl b/dist/searcHPV-1.0.9.8-py3-none-any.whl new file mode 100644 index 0000000..4bf9b11 Binary files /dev/null and b/dist/searcHPV-1.0.9.8-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.9.8-py3.8.egg b/dist/searcHPV-1.0.9.8-py3.8.egg new file mode 100644 index 0000000..3fab303 Binary files /dev/null and b/dist/searcHPV-1.0.9.8-py3.8.egg differ diff --git a/dist/searcHPV-1.0.9.8.tar.gz 
b/dist/searcHPV-1.0.9.8.tar.gz new file mode 100644 index 0000000..8acd050 Binary files /dev/null and b/dist/searcHPV-1.0.9.8.tar.gz differ diff --git a/dist/searcHPV-1.0.9.9-py3-none-any.whl b/dist/searcHPV-1.0.9.9-py3-none-any.whl new file mode 100644 index 0000000..a3c61c3 Binary files /dev/null and b/dist/searcHPV-1.0.9.9-py3-none-any.whl differ diff --git a/dist/searcHPV-1.0.9.9-py3.8.egg b/dist/searcHPV-1.0.9.9-py3.8.egg new file mode 100644 index 0000000..5dca0ba Binary files /dev/null and b/dist/searcHPV-1.0.9.9-py3.8.egg differ diff --git a/dist/searcHPV-1.0.9.9.tar.gz b/dist/searcHPV-1.0.9.9.tar.gz new file mode 100644 index 0000000..e65ca58 Binary files /dev/null and b/dist/searcHPV-1.0.9.9.tar.gz differ diff --git a/dist/searcHPV-1.0.9.tar.gz b/dist/searcHPV-1.0.9.tar.gz new file mode 100644 index 0000000..397e86a Binary files /dev/null and b/dist/searcHPV-1.0.9.tar.gz differ diff --git a/environment.yaml b/environment.yaml index ee23930..1beca53 100644 --- a/environment.yaml +++ b/environment.yaml @@ -14,7 +14,7 @@ dependencies: - cap3=10.2011 - pysam=0.15.3 - pandas=1.2.4 - - pip + - pip=22.3.1 - pip: - argparse - searcHPV diff --git a/environment_2.yaml b/environment_2.yaml new file mode 100644 index 0000000..9f1ea0e --- /dev/null +++ b/environment_2.yaml @@ -0,0 +1,21 @@ +name: searcHPV_2 +channels: + - bioconda + - defaults + - r +dependencies: + - python=3.7.3 + - samtools=1.9 + - bwa=0.7.15 + - java-jdk=8.0 + - gatk=3.8 + - picard=2.23.8 + - pear=0.9.6 + - cap3=10.2011 + - pysam=0.15.3 + - pandas=1.2.4 + - pip + - pip: + - argparse + - searcHPV + diff --git a/searcHPV.egg-info/PKG-INFO b/searcHPV.egg-info/PKG-INFO index f3c59ed..7b65504 100644 --- a/searcHPV.egg-info/PKG-INFO +++ b/searcHPV.egg-info/PKG-INFO @@ -1,104 +1,11 @@ Metadata-Version: 2.1 Name: searcHPV -Version: 1.0.3 +Version: 1.0.17 Summary: An HPV integration sites detection tool for targeted capture sequencing data Home-page: https://github.com/WenjinGudaisy/SearcHPV Author: Wenjin 
Gu Author-email: wenjingu@umich.edu -License: UNKNOWN -Description: # SearcHPV - An HPV integration point detection tool for targeted capture sequencing data - - ## Introdution - * SearcHPV detects HPV fusion sites on both human genome and HPV genome - * SearcHPV is able to provide locally assembled contigs for each integration events. It will report at least one and at most two contigs for each integration sites. The two contigs will provide information captured for left and right sides of the event. - - ## Getting started - 1. Required resources - * Unix like environment - * Third-party tools: - ``` - Python/3.7.3 https://www.python.org/downloads/release/python-373/ - samtools/1.5 https://github.com/samtools/samtools/releases/tag/1.5 - BWA/0.7.15-r1140 https://github.com/lh3/bwa/releases/tag/v0.7.15 - java/1.8.0_252 https://www.oracle.com/java/technologies/javase/8all-relnotes.html - Picard Tools/2.23.8 https://github.com/broadinstitute/picard/releases/tag/2.23.8 - PEAR/0.9.2 https://github.com/tseemann/PEAR - CAP3/02/10/15 http://seq.cs.iastate.edu/cap3.html - - ``` - After intalling these tools, please make sure that their path have been added to you ".bashrc" script so that you can use them by typing the tool names in the terminal. - - 2. Download and install - Firstly, download and install the required resources. - Then, tap these commands in your terminal: - ``` - pip install searcHPV - - ``` - - 3. Usage - SearcHPV have four main steps. You could either run it start-to-finish or run it step-by-step. - - * Usage: - ``` - searcHPV ... 
- ``` - * Standard options: - ``` - -fastq1 sequencing data: fastq/fq.gz file - -fastq2 sequencing data: fastq/fq.gz file - -humRef human reference genome: fasta file - -virRef HPV reference genome: fasta file - ``` - * Optional options: - ``` - -h, --help show this help message and exit - -window the length of region searching for informative reads, default=300 - -output output directory, default "./" - -alignment run the alignment step, step1 - -genomeFusion call the genome fusion points, step2 - -assemble local assemble for each integration event, step3 - -hpvFusion call the HPV fusion points, step4 - - ``` - * Examples: - 1) Run it start-to-finish: - ``` - searcHPV -fastq1 Sample_81279.R1.fastq.gz -fastq2 Sample_81279.R2.fastq.gz -humRef hs37d5.fa -virRef HPV.fa -output /home/scratch/HPV_fusion/Sample_81279 - - ``` - 2) Run it step-by-step: - ``` - searchHPV -align -fastq1 Sample_81279.R1.fastq.gz -fastq2 Sample_81279.R2.fastq.gz -humRef hs37d5.fa -virRef HPV.fa -output /home/scratch/HPV_fusion/Sample_81279 - searchHPV -genomeFusion -fastq1 Sample_81279.R1.fastq.gz -fastq2 Sample_81279.R2.fastq.gz -humRef hs37d5.fa -virRef HPV.fa -output /home/scratch/HPV_fusion/Sample_81279 - searchHPV -assemble -fastq1 Sample_81279.R1.fastq.gz -fastq2 Sample_81279.R2.fastq.gz -humRef hs37d5.fa -virRef HPV.fa -output /home/scratch/HPV_fusion/Sample_81279 - searchHPV -hpvFusion -fastq1 Sample_81279.R1.fastq.gz -fastq2 Sample_81279.R2.fastq.gz -humRef hs37d5.fa -virRef HPV.fa -output /home/scratch/HPV_fusion/Sample_81279 - - ``` - Note: if run it step-by-step, please make sure the output directories for all steps are the same. - - ## Output - 1. Alignment: the marked dupliaction alignment bam file and customized reference genome.\\ - 2. Genome Fusion Point Calling: orignal callset, filtered callset, filtered clustered callset.\\ - 3. Assemble: supportive reads, contigs for each integration events (unfiltered).\\ - 4. 
HPV fusion Point Calling: alignment bam file for contigs againt human and HPV genome.\\ - Final outputs are under the folder "call_fusion_virus": - summary of all the integration events : "HPVfusionPointContig.txt" - contig sequences for all the integration events: "ContigsSequence.fa" - - ## Citation - SearcHPV: a novel approach to identify and assemble human papillomavirus-host genomic integration events in cancer --- In progress - - ## Contact - wenjingu@umich.edu - - - - - Keywords: HPV,integration,targeted capture sequencing -Platform: UNKNOWN Classifier: Development Status :: 3 - Alpha Classifier: License :: OSI Approved :: MIT License Classifier: Operating System :: Unix @@ -110,3 +17,166 @@ Classifier: Programming Language :: Python :: 3.8 Classifier: Programming Language :: Python :: 3 :: Only Requires-Python: >=3.5, <4 Description-Content-Type: text/markdown +License-File: LICENSE + +[![Documentation Status](https://readthedocs.org/projects/searchpv/badge/?version=stable)](https://searchpv.readthedocs.io/en/stable/?badge=stable) +[![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://github.com/WenjinGudaisy/SearcHPV/blob/main/LICENSE) +[![PyPI version](https://badge.fury.io/py/searcHPV.svg)](https://badge.fury.io/py/searcHPV) +
+ +|Host | Downloads | +|:----|:---------:| +|PyPI | [![Downloads](https://pepy.tech/badge/searchpv)](https://pepy.tech/project/searchpv) + +# SearcHPV +An HPV integration point detection tool for targeted capture sequencing data + +## Introduction +* SearcHPV detects HPV fusion sites on both human genome and HPV genome +* SearcHPV is able to provide locally assembled contigs for each integration event. It will report at least one and at most two contigs for each integration site. The two contigs will provide information captured for left and right sides of the event. + +## Getting started +1. Required resources +* Unix-like environment + + +2. Download and install +Firstly, download and install the required resources. + 1) Download Anaconda >=4.11.0: https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html#install-linux-silent + + 2) Download the "environment.yaml" file under this repository + + 3) Create a conda environment for SearcHPV: + ``` + conda env create -f [your_path]/environment.yaml + + ``` + This command will automatically set up all the third-party tools and packages required for SearcHPV and install the latest version of SearcHPV. The name of the environment is "searcHPV". + + You can check the packages and tools in this environment by: + + ``` + conda list -n searcHPV + + ``` + + You can update the environment by: + ``` + conda env update -f [your_path]/environment.yaml + + ``` + + + +3. Usage + +SearcHPV has four main steps. You could either run it start-to-finish or run it step-by-step. + +* Before running SearcHPV, activate the conda environment: + +``` +conda activate searcHPV + +``` + +If you are running commands in a bash script, start with: + +``` +#!/bin/bash +source ~/anaconda3/etc/profile.d/conda.sh; +conda activate searcHPV; +#[searcHPV commands...] +``` +Note: Please check your path of "conda.sh" if you did not install Anaconda in the home directory. + +* Usage of searcHPV: + +``` +searcHPV ... 
+``` +* Standard options: +``` + -fastq1 sequencing data: fastq/fq.gz file + -fastq2 sequencing data: fastq/fq.gz file + -humRef human reference genome: fasta file + -virRef HPV reference genome: fasta file +``` +* Optional options: +``` +-h, --help show this help message and exit +-window the length of region searching for informative reads, default=300 +-output output directory, default "./" +-alignment run the alignment step, step1 +-genomeFusion call the genome fusion points, step2 +-assemble local assemble for each integration event, step3 +-hpvFusion call the HPV fusion points, step4 +-clusterWindow the length of window of clustering integration sites,default=100 +-gz if fastq files are in gz format +-poly(dn) N poly(n), n*d(A/T/C/G), will report low confidence if contig contains poly(n), default=20 +-index index the original human and virus reference files, default=False +``` + +Note: If you've already indexed the virus and human reference files for BWA, Samtools, and Picard, you do not need to add the "-index" option, especially when you are running a batch of samples that share the same virus and human reference files and you do not want to spend time indexing the references every time you run a sample. The commands for indexing the virus and human reference files: + +``` +#activate SearcHPV conda environment first to make sure using the correct versions of tools +ref = '[path_of_your_reference_file]' +bwa index {ref} +samtools faidx {ref} +picard CreateSequenceDictionary R={ref} O={ref.replace('.fa','.dict')} +``` + + +4. 
Examples: + + 1) Run it start-to-finish and submit an SBATCH job: + ``` + #!/bin/bash + #SBATCH --job-name=searcHPV + #SBATCH --mail-user=wenjingu@umich.edu + #SBATCH --mail-type=BEGIN,END + #SBATCH --cpus-per-task=1 + #SBATCH --nodes=1 + #SBATCH --ntasks-per-node=8 + #SBATCH --mem=40gb + #SBATCH --time=100:00:00 + #SBATCH --account=XXXXX + #SBATCH --partition=standard + #SBATCH --output=searcHPV.log + #SBATCH --error=searcHPV.err + source ~/anaconda3/etc/profile.d/conda.sh; + conda activate searcHPV; + searcHPV -fastq1 Sample_81279.R1.fastq.gz -fastq2 Sample_81279.R2.fastq.gz -humRef hs37d5.fa -virRef HPV.fa -output /home/scratch/HPV_fusion/Sample_81279 -gz -index; + ``` + + + 2) Run it step-by-step: + + + ``` + searcHPV -alignment -fastq1 Sample_81279.R1.fastq.gz -fastq2 Sample_81279.R2.fastq.gz -humRef hs37d5.fa -virRef HPV.fa -output /home/scratch/HPV_fusion/Sample_81279 -gz -index + searcHPV -genomeFusion -fastq1 Sample_81279.R1.fastq.gz -fastq2 Sample_81279.R2.fastq.gz -humRef hs37d5.fa -virRef HPV.fa -output /home/scratch/HPV_fusion/Sample_81279 -gz + searcHPV -assemble -fastq1 Sample_81279.R1.fastq.gz -fastq2 Sample_81279.R2.fastq.gz -humRef hs37d5.fa -virRef HPV.fa -output /home/scratch/HPV_fusion/Sample_81279 -gz + searcHPV -hpvFusion -fastq1 Sample_81279.R1.fastq.gz -fastq2 Sample_81279.R2.fastq.gz -humRef hs37d5.fa -virRef HPV.fa -output /home/scratch/HPV_fusion/Sample_81279 -gz + + ``` + Note: if you run it step-by-step, please make sure the output directories for all steps are the same. + +## Output +1. Alignment: the duplicate-marked alignment bam file and customized reference genome.\\ +2. Genome Fusion Point Calling: original callset, filtered callset, filtered clustered callset.\\ +3. Assemble: supportive reads, contigs for each integration event (unfiltered).\\ +4. 
HPV fusion Point Calling: alignment bam file for contigs againt human and HPV genome.\\ +Final outputs are under the folder "call_fusion_virus": +summary of all the integration events : "HPVfusionPointContig.txt" +contig sequences for all the integration events: "ContigsSequence.fa" + +## Citation +SearcHPV: a novel approach to identify and assemble human papillomavirus-host genomic integration events in cancer --- Accepted by Cancer + +## Contact +wenjingu@umich.edu + + + + diff --git a/searcHPV.egg-info/SOURCES.txt b/searcHPV.egg-info/SOURCES.txt index 4baf931..f6232dc 100644 --- a/searcHPV.egg-info/SOURCES.txt +++ b/searcHPV.egg-info/SOURCES.txt @@ -1,3 +1,4 @@ +LICENSE README.md setup.py ./searcHPV/__init__.py @@ -9,7 +10,6 @@ setup.py ./searcHPV/generate_call_fusion.py ./searcHPV/generate_call_fusion_virus.py ./searcHPV/genome_fusion.py -./searcHPV/hpv_fusion.py ./searcHPV/main.py ./searcHPV/selection_contig_call_virus_insertion.py searcHPV/__init__.py @@ -21,9 +21,9 @@ searcHPV/generate_assemble.py searcHPV/generate_call_fusion.py searcHPV/generate_call_fusion_virus.py searcHPV/genome_fusion.py -searcHPV/hpv_fusion.py searcHPV/main.py searcHPV/selection_contig_call_virus_insertion.py +searcHPV/virus_fusion.py searcHPV.egg-info/PKG-INFO searcHPV.egg-info/SOURCES.txt searcHPV.egg-info/dependency_links.txt diff --git a/searcHPV.egg-info/entry_points.txt b/searcHPV.egg-info/entry_points.txt index 41782e0..e13d66a 100644 --- a/searcHPV.egg-info/entry_points.txt +++ b/searcHPV.egg-info/entry_points.txt @@ -1,3 +1,2 @@ [console_scripts] searcHPV = searcHPV.main:main - diff --git a/searcHPV/__pycache__/__init__.cpython-38.pyc b/searcHPV/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..55baa69 Binary files /dev/null and b/searcHPV/__pycache__/__init__.cpython-38.pyc differ diff --git a/searcHPV/__pycache__/alignment.cpython-38.pyc b/searcHPV/__pycache__/alignment.cpython-38.pyc new file mode 100644 index 0000000..61e5adb Binary files /dev/null 
and b/searcHPV/__pycache__/alignment.cpython-38.pyc differ diff --git a/searcHPV/__pycache__/assemble.cpython-38.pyc b/searcHPV/__pycache__/assemble.cpython-38.pyc new file mode 100644 index 0000000..007fce2 Binary files /dev/null and b/searcHPV/__pycache__/assemble.cpython-38.pyc differ diff --git a/searcHPV/__pycache__/general.cpython-38.pyc b/searcHPV/__pycache__/general.cpython-38.pyc new file mode 100644 index 0000000..f99d559 Binary files /dev/null and b/searcHPV/__pycache__/general.cpython-38.pyc differ diff --git a/searcHPV/__pycache__/generate_alignment.cpython-38.pyc b/searcHPV/__pycache__/generate_alignment.cpython-38.pyc new file mode 100644 index 0000000..b20c653 Binary files /dev/null and b/searcHPV/__pycache__/generate_alignment.cpython-38.pyc differ diff --git a/searcHPV/__pycache__/generate_assemble.cpython-38.pyc b/searcHPV/__pycache__/generate_assemble.cpython-38.pyc new file mode 100644 index 0000000..7367411 Binary files /dev/null and b/searcHPV/__pycache__/generate_assemble.cpython-38.pyc differ diff --git a/searcHPV/__pycache__/generate_call_fusion.cpython-38.pyc b/searcHPV/__pycache__/generate_call_fusion.cpython-38.pyc new file mode 100644 index 0000000..25916a5 Binary files /dev/null and b/searcHPV/__pycache__/generate_call_fusion.cpython-38.pyc differ diff --git a/searcHPV/__pycache__/generate_call_fusion_virus.cpython-38.pyc b/searcHPV/__pycache__/generate_call_fusion_virus.cpython-38.pyc new file mode 100644 index 0000000..67b7393 Binary files /dev/null and b/searcHPV/__pycache__/generate_call_fusion_virus.cpython-38.pyc differ diff --git a/searcHPV/__pycache__/genome_fusion.cpython-38.pyc b/searcHPV/__pycache__/genome_fusion.cpython-38.pyc new file mode 100644 index 0000000..c685b33 Binary files /dev/null and b/searcHPV/__pycache__/genome_fusion.cpython-38.pyc differ diff --git a/searcHPV/__pycache__/hpv_fusion.cpython-38.pyc b/searcHPV/__pycache__/hpv_fusion.cpython-38.pyc new file mode 100644 index 0000000..0e96a55 Binary files 
/dev/null and b/searcHPV/__pycache__/hpv_fusion.cpython-38.pyc differ diff --git a/searcHPV/__pycache__/main.cpython-38.pyc b/searcHPV/__pycache__/main.cpython-38.pyc new file mode 100644 index 0000000..66a7781 Binary files /dev/null and b/searcHPV/__pycache__/main.cpython-38.pyc differ diff --git a/searcHPV/__pycache__/selection_contig_call_virus_insertion.cpython-38.pyc b/searcHPV/__pycache__/selection_contig_call_virus_insertion.cpython-38.pyc new file mode 100644 index 0000000..54076d8 Binary files /dev/null and b/searcHPV/__pycache__/selection_contig_call_virus_insertion.cpython-38.pyc differ diff --git a/searcHPV/alignment.py b/searcHPV/alignment.py index 02c8c06..c744166 100644 --- a/searcHPV/alignment.py +++ b/searcHPV/alignment.py @@ -13,7 +13,8 @@ #outputDir: output directory #multi: if fastq file is in gz format: default = True #index: if True, index the reference files; if False, not index the references files -def alignment(fq1, fq2, humRef, virRef, outputDir, index, gz): +#memory: memory size allocated +def alignment(fq1, fq2, humRef, virRef, outputDir, index, gz, memory,thread): #make output dir outputDir = os.path.abspath(outputDir) mkdir(outputDir) @@ -26,8 +27,8 @@ def alignment(fq1, fq2, humRef, virRef, outputDir, index, gz): alignmentFile = scriptDir + "/orignal.alignment.sh" indelFile = scriptDir + "/indel.alignment.sh" - generate_alignment_bash(alignmentFile,ref,fq1,fq2,scriptDir,gz) - generate_indel_alignment_bash(indelFile,ref,scriptDir) + generate_alignment_bash(alignmentFile,ref,fq1,fq2,scriptDir,memory,thread,gz) + generate_indel_alignment_bash(indelFile,ref,scriptDir, memory,thread) check_file(alignmentFile) check_file(indelFile) bashFile = scriptDir + f"/alignment.sh" @@ -51,7 +52,7 @@ def alignment(fq1, fq2, humRef, virRef, outputDir, index, gz): if index: ##generate mkdup alignment bash file mkdupFile = scriptDir + "/mkdup.alignment.sh" - generate_mkdup_bash(mkdupFile,scriptDir) + generate_mkdup_bash(mkdupFile,scriptDir,thread) with 
open(bashFile,'w') as output: output.write(f'''#!/bin/bash @@ -65,7 +66,7 @@ def alignment(fq1, fq2, humRef, virRef, outputDir, index, gz): else: ##generate mkdup alignment bash file mkdupFile = scriptDir + "/mkdup.alignment.sh" - generate_mkdup_bash(mkdupFile,scriptDir) + generate_mkdup_bash(mkdupFile,scriptDir,thread) with open(bashFile,'w') as output: output.write(f'''#!/bin/bash diff --git a/searcHPV/assemble.py b/searcHPV/assemble.py index 7564fb0..872d96a 100644 --- a/searcHPV/assemble.py +++ b/searcHPV/assemble.py @@ -9,7 +9,7 @@ #out_dir: output directory for searcHPV #virRef: virus reference genome #window: the length of region searching for informative reads, default=300 -def assemble(fq1, fq2, out_dir,virRef,gz,window): +def assemble(fq1, fq2, out_dir,virRef,gz,window,memory,thread): bam = f'{out_dir}/alignment/alignment.RG.indelre.mkdup.sort.bam' check_file(bam) assemble_out_dir = f'{out_dir}/assemble/' @@ -20,7 +20,7 @@ def assemble(fq1, fq2, out_dir,virRef,gz,window): subprocess.call(script_read_seq) #preprocessForPear(assemble_out_dir) - script_pear = pear(assemble_out_dir) + script_pear = pear(assemble_out_dir,memory,thread) os.system(f'chmod +x {script_pear}') subprocess.call(script_pear) diff --git a/searcHPV/generate_alignment.py b/searcHPV/generate_alignment.py index de32c69..a140426 100644 --- a/searcHPV/generate_alignment.py +++ b/searcHPV/generate_alignment.py @@ -42,21 +42,21 @@ def indexRef(bash_file,ref): #fq2: fastq2 file #out_dir: outputPath #gz: if fastq file is in gz format: default = True -def generate_alignment_bash(bash_file,ref,fq1,fq2,out_dir,gz = True): +def generate_alignment_bash(bash_file,ref,fq1,fq2,out_dir,memory,thread,gz = True): with open(bash_file,'w') as output: if gz: output.write(f''' - bwa mem {ref} ' {out_dir}/alignment.sam - samtools view -bhS {out_dir}/alignment.sam > {out_dir}/alignment.bam - samtools sort {out_dir}/alignment.bam -o {out_dir}/alignment.sort.bam + bwa mem -t {thread} {ref} ' 
{out_dir}/alignment.sam + samtools view -@ {thread} -bhS {out_dir}/alignment.sam > {out_dir}/alignment.bam + samtools sort -@ {thread} {out_dir}/alignment.bam -o {out_dir}/alignment.sort.bam rm {out_dir}/alignment.sam echo \'alignment done\' ''') else: output.write(f''' - bwa mem {ref} ' {out_dir}/alignment.sam - samtools view -bhS {out_dir}/alignment.sam > {out_dir}/alignment.bam - samtools sort {out_dir}/alignment.bam -o {out_dir}/alignment.sort.bam + bwa mem -t {thread} {ref} ' {out_dir}/alignment.sam + samtools view -@ {thread} -bhS {out_dir}/alignment.sam > {out_dir}/alignment.bam + samtools sort -@ {thread} {out_dir}/alignment.bam -o {out_dir}/alignment.sort.bam rm {out_dir}/alignment.sam echo \'alignment done\' ''') @@ -78,7 +78,7 @@ def generate_alignment_bash(bash_file,ref,fq1,fq2,out_dir,gz = True): #picard: full path of picard #gatk: full path of GATK #java: full path of java -def generate_indel_alignment_bash(bash_File,ref,out_dir): +def generate_indel_alignment_bash(bash_File,ref,out_dir,memory,thread): with open(bash_File,'w') as output: output.write(f'''picard \ AddOrReplaceReadGroups \ @@ -89,35 +89,37 @@ def generate_indel_alignment_bash(bash_File,ref,out_dir): RGPU=NA \ RGSM=sample \ RGLB=sample -samtools sort {out_dir}/alignment.RG.bam -o {out_dir}/alignment.RG.sort.bam -samtools index {out_dir}/alignment.RG.sort.bam +samtools sort -@ {thread} {out_dir}/alignment.RG.bam -o {out_dir}/alignment.RG.sort.bam +samtools index -@ {thread} {out_dir}/alignment.RG.sort.bam GenomeAnalysisTK \ +-Xmx{memory} \ -T RealignerTargetCreator \ -R {ref} \ -I {out_dir}/alignment.RG.sort.bam \ -o {out_dir}/alignment.RG.intervals GenomeAnalysisTK \ +-Xmx{memory} \ -T IndelRealigner \ -R {ref} \ -I {out_dir}/alignment.RG.sort.bam \ -targetIntervals {out_dir}/alignment.RG.intervals \ -o {out_dir}/alignment.RG.indelre.bam -samtools index {out_dir}/alignment.RG.indelre.bam +samtools index -@ {thread} {out_dir}/alignment.RG.indelre.bam echo \'indel alignment done\'''') -def 
generate_mkdup_bash(bash_File,out_dir): +def generate_mkdup_bash(bash_File,out_dir,thread): with open(bash_File,'w') as output: output.write(f''' -samtools sort -n {out_dir}/alignment.RG.indelre.bam -o {out_dir}/alignment.RG.indelre.sortbyQ.bam +samtools sort -@ {thread} -n {out_dir}/alignment.RG.indelre.bam -o {out_dir}/alignment.RG.indelre.sortbyQ.bam picard MarkDuplicates \ I={out_dir}/alignment.RG.indelre.sortbyQ.bam \ O={out_dir}/alignment.RG.indelre.mkdup.bam \ M={out_dir}/alignment.RG.indelre.mkdup.txt \ TAGGING_POLICY=All ASSUME_SORT_ORDER=queryname -samtools sort {out_dir}/alignment.RG.indelre.mkdup.bam -o {out_dir}/alignment.RG.indelre.mkdup.sort.bam -samtools index {out_dir}/alignment.RG.indelre.mkdup.sort.bam +samtools sort -@ {thread} {out_dir}/alignment.RG.indelre.mkdup.bam -o {out_dir}/alignment.RG.indelre.mkdup.sort.bam +samtools index -@ {thread} {out_dir}/alignment.RG.indelre.mkdup.sort.bam echo \'indel alignment done\'''') diff --git a/searcHPV/generate_assemble.py b/searcHPV/generate_assemble.py index 302d5a0..60c0da0 100644 --- a/searcHPV/generate_assemble.py +++ b/searcHPV/generate_assemble.py @@ -3,6 +3,7 @@ import re import pysam from searcHPV.general import * +import string ######################## #get informative reads (SP + PE) from bam file @@ -67,7 +68,9 @@ def extract_read_name(bam,out_dir,virRef,window): if '>' in each: virus_chrm_list.append(each.split()[0].replace('>','')) for virus_chrm in virus_chrm_list: - fusionRes = f'{out_dir}/call_fusion/{virus_chrm}.all.filtered.clustered.result' + invalidCharacter = re.escape(string.punctuation) + virus_chrm_file_name = re.sub(r'['+invalidCharacter+']',"_",virus_chrm) + fusionRes = f'{out_dir}/call_fusion/{virus_chrm_file_name}.all.filtered.clustered.result' check_file(fusionRes) with open(fusionRes) as candidate_in: candidate_in = candidate_in.read().rstrip() @@ -172,7 +175,7 @@ def preprocessForPear(out_dir): #Function:generate bash script for PEAR #out_dir:output directory for assemble 
#return:path of bash script -def pear(out_dir): +def pear(out_dir,memory,thread): listSites = os.listdir(out_dir) with open(f'{out_dir}/pear.sh','w') as output: output.write('#!/bin/bash\n') @@ -184,6 +187,8 @@ def pear(out_dir): os.mkdir(outputPath) output.write(f''' pear \ + -j {thread} \ + -y {memory} \ -f {fqPath}/{site}.informativeReads.1.fq \ -r {fqPath}/{site}.informativeReads.2.fq \ -o {outputPath}/{site}''') diff --git a/searcHPV/generate_call_fusion.py b/searcHPV/generate_call_fusion.py index 464c8ff..d9fae38 100644 --- a/searcHPV/generate_call_fusion.py +++ b/searcHPV/generate_call_fusion.py @@ -2,6 +2,8 @@ import sys from searcHPV.general import * import re +import string +import re ################## #find the end of each cigar string slot @@ -153,12 +155,13 @@ def define_fusion(bam,virus_chrm,out_dir): if read.next_reference_name == virus_chrm: paired_evidence_count += 1 b[i] += (str(paired_evidence_count),) - - with open(f'{out_dir}/{virus_chrm}.genome_fusion.txt','w') as output: + invalidCharacter = re.escape(string.punctuation) + virus_chrm_file_name = re.sub(r'['+invalidCharacter+']',"_",virus_chrm) + with open(f'{out_dir}/{virus_chrm_file_name}.genome_fusion.txt','w') as output: output.write('chrm\tpos\tsingle_evidence\tpaired_evidence\n') for i in b: output.write(f'{i[0]}\t{i[1]}\t{i[2]}\t{i[3]}\n') - return f'{out_dir}/{virus_chrm}.genome_fusion.txt' + return f'{out_dir}/{virus_chrm_file_name}.genome_fusion.txt' ################## #cluster fusion points within certain base pair diff --git a/searcHPV/generate_call_fusion_virus.py b/searcHPV/generate_call_fusion_virus.py index bdc860f..715ae1b 100644 --- a/searcHPV/generate_call_fusion_virus.py +++ b/searcHPV/generate_call_fusion_virus.py @@ -8,7 +8,7 @@ #out_dir:out_dir for searcHPV #humRef: human reference genome #return: path of bash script -def mapToRef(out_dir): +def mapToRef(out_dir,thread): newRef = f'{out_dir}/hg_hpv.fa' sitesPath = f'{out_dir}/assemble/' listSites = os.listdir(sitesPath) 
@@ -23,11 +23,11 @@ def mapToRef(out_dir): contigPath = f'{sitesPath}/{site}/pearOutput/' sitePath = f'{out_dir}/call_fusion_virus/{site}/' mkdir(sitePath) - bashFile.write( f'''bwa mem -R \'@RG\\tID:hpv\\tSM:hpv\\tLB:hpv\\tPL:ILLUMINA\' -M -t 8 \ + bashFile.write( f'''bwa mem -t {thread} -R \'@RG\\tID:hpv\\tSM:hpv\\tLB:hpv\\tPL:ILLUMINA\' -M \ {newRef} {contigPath}/{site}.all.fa.cap.contigs > {sitePath}/{site}.contig.sam; -samtools view -bhS {sitePath}/{site}.contig.sam > {sitePath}/{site}.contig.bam; -samtools sort {sitePath}/{site}.contig.bam -o {sitePath}/{site}.contig.sort.bam; -samtools index {sitePath}/{site}.contig.sort.bam; +samtools view -@ {thread} -bhS {sitePath}/{site}.contig.sam > {sitePath}/{site}.contig.bam; +samtools sort -@ {thread} {sitePath}/{site}.contig.bam -o {sitePath}/{site}.contig.sort.bam; +samtools index -@ {thread} {sitePath}/{site}.contig.sort.bam; rm {sitePath}/{site}.contig.bam;\n''') return f'{outputPath}/alignContigsToRef.sh' @@ -39,7 +39,7 @@ def mapToRef(out_dir): #out_dir:out_dir for searcHPV #humRef: human reference genome #return: path of bash script -def mapToHgRef(out_dir,humRef): +def mapToHgRef(out_dir,humRef,thread): sitesPath = f'{out_dir}/assemble/' listSites = os.listdir(sitesPath) outputPath = f'{out_dir}/call_fusion_virus/' @@ -53,11 +53,11 @@ def mapToHgRef(out_dir,humRef): contigPath = f'{sitesPath}/{site}/pearOutput/' sitePath = f'{out_dir}/call_fusion_virus/{site}/' mkdir(sitePath) - bashFile.write( f'''bwa mem -R \'@RG\\tID:hpv\\tSM:hpv\\tLB:hpv\\tPL:ILLUMINA\' -M -t 8 \ + bashFile.write( f'''bwa mem -t {thread} -R \'@RG\\tID:hpv\\tSM:hpv\\tLB:hpv\\tPL:ILLUMINA\' -M \ {humRef} {contigPath}/{site}.all.fa.cap.contigs > {sitePath}/{site}.contigToGenome.sam; -samtools view -bhS {sitePath}/{site}.contigToGenome.sam > {sitePath}/{site}.contigToGenome.bam; -samtools sort {sitePath}/{site}.contigToGenome.bam -o {sitePath}/{site}.contigToGenome.sort.bam; -samtools index 
{sitePath}/{site}.contigToGenome.sort.bam; +samtools view -@ {thread} -bhS {sitePath}/{site}.contigToGenome.sam > {sitePath}/{site}.contigToGenome.bam; +samtools sort -@ {thread} {sitePath}/{site}.contigToGenome.bam -o {sitePath}/{site}.contigToGenome.sort.bam; +samtools index -@ {thread} {sitePath}/{site}.contigToGenome.sort.bam; rm {sitePath}/{site}.contigToGenome.bam;\n''') return f'{outputPath}/alignContigsToGenome.sh' @@ -66,7 +66,7 @@ def mapToHgRef(out_dir,humRef): #out_dir:out_dir for searcHPV #virRef: virus reference genome #return: path of bash script -def mapToVirRef(out_dir,virRef): +def mapToVirRef(out_dir,virRef,thread): sitesPath = f'{out_dir}/assemble/' listSites = os.listdir(sitesPath) outputPath = f'{out_dir}/call_fusion_virus/' @@ -79,12 +79,12 @@ def mapToVirRef(out_dir,virRef): if ".sh" not in site: contigPath = f'{sitesPath}/{site}/pearOutput/' sitePath = f'{out_dir}/call_fusion_virus/{site}/' - bashFile.write( f'''bwa mem -R \'@RG\\tID:hpv\\tSM:hpv\\tLB:hpv\\tPL:ILLUMINA\' -M -t 8 \ + bashFile.write( f'''bwa mem -t {thread} -R \'@RG\\tID:hpv\\tSM:hpv\\tLB:hpv\\tPL:ILLUMINA\' -M \ {virRef} {contigPath}/{site}.all.fa.cap.contigs > {contigPath}/{site}.contigToVirus.sam; -samtools view -bhS {contigPath}/{site}.contigToVirus.sam > {contigPath}/{site}.contigToVirus.bam; -samtools sort {contigPath}/{site}.contigToVirus.bam -o {sitePath}/{site}.contigToVirus.sort.bam; -samtools index {sitePath}/{site}.contigToVirus.sort.bam; -samtools faidx {contigPath}/{site}.all.fa.cap.contigs; +samtools view -@ {thread} -bhS {contigPath}/{site}.contigToVirus.sam > {contigPath}/{site}.contigToVirus.bam; +samtools sort -@ {thread} {contigPath}/{site}.contigToVirus.bam -o {sitePath}/{site}.contigToVirus.sort.bam; +samtools index -@ {thread} {sitePath}/{site}.contigToVirus.sort.bam; +samtools faidx -@ {thread} {contigPath}/{site}.all.fa.cap.contigs; rm {contigPath}/{site}.contigToVirus.bam;\n''') return f'{outputPath}/alignContigsToVirus.sh' diff --git 
a/searcHPV/genome_fusion.py b/searcHPV/genome_fusion.py index be8685f..36e419e 100644 --- a/searcHPV/genome_fusion.py +++ b/searcHPV/genome_fusion.py @@ -2,6 +2,7 @@ import sys from searcHPV.general import * from searcHPV.generate_call_fusion import * +import string def genomeFusion(window,out_dir,virRef): @@ -24,7 +25,10 @@ def genomeFusion(window,out_dir,virRef): #filter and cluster fusion points ##sort result - os.system(f'(head -n 1 {res} && tail -n +2 {res} | sort -k3,3rn) > {out_dir}/{virus_chrm}.genome_fusion.sort.txt') + #if there are invalid characters in the chromosome name: + invalidCharacter = re.escape(string.punctuation) + virus_chrm_file_name = re.sub(r'['+invalidCharacter+']',"_",virus_chrm) + os.system(f'(head -n 1 {res} && tail -n +2 {res} | sort -k3,3rn) > {out_dir}/{virus_chrm_file_name}.genome_fusion.sort.txt') #change format for cluster @@ -32,9 +36,9 @@ def genomeFusion(window,out_dir,virRef): chrm_li = list(map(str,chrm_li)) chrm_li+=['X','Y'] - with open(f'{out_dir}/{virus_chrm}.all.result','w') as output: + with open(f'{out_dir}/{virus_chrm_file_name}.all.result','w') as output: fusion_li = [] - with open(f'{out_dir}/{virus_chrm}.genome_fusion.sort.txt') as res: + with open(f'{out_dir}/{virus_chrm_file_name}.genome_fusion.sort.txt') as res: res.readline() for line in res.read().rstrip().split('\n'): elements = line.rstrip().split('\t') @@ -51,12 +55,12 @@ def genomeFusion(window,out_dir,virRef): output.write(f'{to_print}') ##cluster the events within 100bp from each other, maybe becasue of SVs or CNVs - cluster_result(f'{out_dir}/{virus_chrm}.all.result',f'{out_dir}/{virus_chrm}.all.clustered.result',window) + cluster_result(f'{out_dir}/{virus_chrm_file_name}.all.result',f'{out_dir}/{virus_chrm_file_name}.all.clustered.result',window) ##filter for sites with at least 2 split read and 2 pairs of read support(high cutoff) and their summation greater than 5 - with open(f'{out_dir}/{virus_chrm}.all.clustered.result') as inf: - with 
open(f'{out_dir}/{virus_chrm}.all.filtered.clustered.result','w') as outf: + with open(f'{out_dir}/{virus_chrm_file_name}.all.clustered.result') as inf: + with open(f'{out_dir}/{virus_chrm_file_name}.all.filtered.clustered.result','w') as outf: for line in inf.read().rstrip().split('\n'): elements = line.rstrip() if elements == "": diff --git a/searcHPV/main.py b/searcHPV/main.py index 24b6c82..fd41f4d 100644 --- a/searcHPV/main.py +++ b/searcHPV/main.py @@ -61,6 +61,13 @@ def main(): default = False, dest ='index', help ="index the original human and virus reference files, default=False") + parser.add_argument('-memory', type = str, default="10G",dest = 'memory', + help ='specify the maximum size, default "10G"', + ) + parser.add_argument('-thread', type = int, default=1,dest = 'thread', + help ='number of threads, default 1', + ) + @@ -72,19 +79,19 @@ def main(): if args.alignment: - alignment(fq1 = args.fq1, fq2 = args.fq2, humRef = args.humRef, virRef = args.virRef, index = args.index, outputDir = args.outputDir, gz = args.gz) + alignment(fq1 = args.fq1, fq2 = args.fq2, humRef = args.humRef, virRef = args.virRef, index = args.index, outputDir = args.outputDir, gz = args.gz, memory = args.memory,thread = args.thread) elif args.genomeFusion: genomeFusion(args.clusterWindow,args.outputDir,args.virRef) elif args.assemble: #check result from genomeFusion - assemble(args.fq1, args.fq2, args.outputDir,args.virRef,args.gz,args.window) + assemble(args.fq1, args.fq2, args.outputDir,args.virRef,args.gz,args.window, memory = args.memory,thread = args.thread) elif args.hpvFusion: #check result from assemble - virus_fusion(args.humRef,args.virRef,args.outputDir,args.n) + virus_fusion(args.humRef,args.virRef,args.outputDir,args.n,thread = args.thread) else: - alignment(fq1 = args.fq1, fq2 = args.fq2, humRef = args.humRef, virRef = args.virRef, index = args.index, outputDir = args.outputDir, gz = args.gz) + alignment(fq1 = args.fq1, fq2 = args.fq2, humRef = args.humRef, virRef 
= args.virRef, index = args.index, outputDir = args.outputDir, gz = args.gz, memory = args.memory,thread = args.thread) genomeFusion(args.clusterWindow,args.outputDir,args.virRef) - assemble(args.fq1, args.fq2, args.outputDir,args.virRef,args.gz,args.window) - virus_fusion(args.humRef,args.virRef,args.outputDir,args.n) + assemble(args.fq1, args.fq2, args.outputDir,args.virRef,args.gz,args.window, memory = args.memory,thread = args.thread) + virus_fusion(args.humRef,args.virRef,args.outputDir,args.n,thread = args.thread) \ No newline at end of file diff --git a/searcHPV/virus_fusion.py b/searcHPV/virus_fusion.py index 42a1562..89f3acf 100644 --- a/searcHPV/virus_fusion.py +++ b/searcHPV/virus_fusion.py @@ -9,10 +9,10 @@ #virRef: virus reference genome #out_dir: output directory for seacHPV #n: poly(n), n*d(A/T/C/G), will report low confidence if contig contains poly(n) -def virus_fusion(humRef,virRef,out_dir,n): - script_map = mapToRef(out_dir) - script_mapHg = mapToHgRef(out_dir,humRef) - script_mapVir = mapToVirRef(out_dir,virRef) +def virus_fusion(humRef,virRef,out_dir,n,thread): + script_map = mapToRef(out_dir,thread) + script_mapHg = mapToHgRef(out_dir,humRef,thread) + script_mapVir = mapToVirRef(out_dir,virRef,thread) os.system(f'chmod +x {script_map}') subprocess.call(script_map) os.system(f'chmod +x {script_mapHg}') diff --git a/setup.py b/setup.py index bf1cb18..8d9f77d 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ # For a discussion on single-sourcing the version across setup.py and the # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.0.10', # Required + version='1.0.17', # Required # This is a one-line description or tagline of what your project does. This # corresponds to the "Summary" metadata field: