From de6f6fd4ab1ca5acbef8988c160a8ab116cc47c0 Mon Sep 17 00:00:00 2001
From: mitchgill16 <39039751+mitchgill16@users.noreply.github.com>
Date: Sun, 23 Jan 2022 11:48:25 +0800
Subject: [PATCH] Blastx relevant scripts uploaded

---
 Scripts/blastx_pipeline.sh | 12 ++++++++++++
 Scripts/extract_blast.py   | 19 +++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 Scripts/blastx_pipeline.sh
 create mode 100644 Scripts/extract_blast.py

diff --git a/Scripts/blastx_pipeline.sh b/Scripts/blastx_pipeline.sh
new file mode 100644
index 0000000..7abb097
--- /dev/null
+++ b/Scripts/blastx_pipeline.sh
@@ -0,0 +1,12 @@
+#Arg1 = cds of a crop to make the database for blast
+#Arg2 = original 'lncrna' fastas to be filtered
+cds=$1
+query=$2
+makeblastdb -dbtype nucl -in $cds
+blastn -query $query -db $1 -outfmt 6 -evalue 1e-10 > blast_results.txt
+python extract_blast.py blast_results.txt
+uniq blast_results.txt_to_filter.txt > transcripts_to_filter.txt      
+awk 'BEGIN{while((getline<"transcripts_to_filter.txt")>0)l[">"$1]=1}/^>/{f=!l[$1]}f' $2 > transcripts_filtered.fa
+wc -l $2
+wc -l transcripts_filtered.fa
+wc -l transcripts_to_filter.txt
diff --git a/Scripts/extract_blast.py b/Scripts/extract_blast.py
new file mode 100644
index 0000000..8782fca
--- /dev/null
+++ b/Scripts/extract_blast.py
@@ -0,0 +1,19 @@
+import pandas as pd
+import sys
+my_file = sys.argv[1]
+print(my_file)
+col_heads = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
+df = pd.read_csv(my_file, sep = "\t", header=None)
+df.columns = col_heads
+outfile = my_file+"_to_filter.txt"
+f = open(outfile, "w")
+for index, row in df.iterrows():
+   length = row["length"]
+   cell = row["qseqid"]
+   cell = cell.split(':')
+   cell = cell[3].split('-')
+   l = int(cell[1]) - int(cell[0])
+   l = l*0.9
+   if(length >= l):
+       f.write(row["qseqid"] + "\n")
+f.close()