diff --git a/Scripts/blastx_pipeline.sh b/Scripts/blastx_pipeline.sh new file mode 100644 index 0000000..7abb097 --- /dev/null +++ b/Scripts/blastx_pipeline.sh @@ -0,0 +1,12 @@ +#Arg1 = cds of a crop to make the database for blast +#Arg2 = original 'lncrna' fastas to be filtered +cds=$1 +query=$2 +makeblastdb -dbtype nucl -in $cds +blastn -query $query -db $1 -outfmt 6 -evalue 1e-10 > blast_results.txt +python extract_blast.py blast_results.txt +uniq blast_results.txt_to_filter.txt > transcripts_to_filter.txt +awk 'BEGIN{while((getline<"transcripts_to_filter.txt")>0)l[">"$1]=1}/^>/{f=!l[$1]}f' $2 > transcripts_filtered.fa +wc -l $2 +wc -l transcripts_filtered.fa +wc -l transcripts_to_filter.txt diff --git a/Scripts/extract_blast.py b/Scripts/extract_blast.py new file mode 100644 index 0000000..8782fca --- /dev/null +++ b/Scripts/extract_blast.py @@ -0,0 +1,19 @@ +import pandas as pd +import sys +my_file = sys.argv[1] +print(my_file) +col_heads = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"] +df = pd.read_csv(my_file, sep = "\t", header=None) +df.columns = col_heads +outfile = my_file+"_to_filter.txt" +f = open(outfile, "w") +for index, row in df.iterrows(): + length = row["length"] + cell = row["qseqid"] + cell = cell.split(':') + cell = cell[3].split('-') + l = int(cell[1]) - int(cell[0]) + l = l*0.9 + if(length >= l): + f.write(row["qseqid"] + "\n") +f.close()