v0.8.5

PombertLab · Sep 23, 2021 · f0b44bf · f0b44bf
1 parent fbec8ce
commit f0b44bf
Show file tree

Hide file tree

Showing 31 changed files with 1,203 additions and 808 deletions.
diff --git a/Homology_search/PDB_headers.pl b/Homology_search/PDB_headers.pl
@@ -1,11 +1,15 @@
 #!/usr/bin/perl
 ## Pombert Lab 2020
-my $version = '0.3c';
+my $version = '0.4';
 my $name = 'PDB_headers.pl';
-my $updated = '2021-07-25';
+my $updated = '2021-09-21';
 
-use strict; use warnings; use Getopt::Long qw(GetOptions); use File::Basename;
-use File::Find; use PerlIO::gzip; 
+use strict;
+use warnings;
+use Getopt::Long qw(GetOptions);
+use File::Basename;
+use File::Find;
+use PerlIO::gzip; 
 
 ## Usage definition
 my $USAGE = <<"OPTIONS";
@@ -32,11 +36,11 @@
 
 ## Defining options
 my $pdb;
-my $out;
+my $rcsb_list;
 my $verbose = 1000;
 GetOptions(
 	'p|pdb=s' => \$pdb,
-	'o|output=s' => \$out,
+	'o|output=s' => \$rcsb_list,
 	'v|verbose=i' => \$verbose
 );
 
@@ -49,8 +53,25 @@
 	$dir
 );
 
+## Doing a first pass to see which files have been parsed previously
+## Should reduce overall computation time by skipping parsing
+my %previous_data;
+my $diamond = '>';
+if (-f $rcsb_list){
+	$diamond = '>>'; 
+	open LIST, "<", "$rcsb_list" or die "Can't read $rcsb_list: $!\n";
+	while (my $line = <LIST>){
+		chomp $line;
+		if ($line =~ /^(\w+)/){
+			my $rcsb_entry = $1;
+			$previous_data{$rcsb_entry} = 1;
+		}
+	}
+	close LIST;
+}
+
 ## Parsing PDB files (*.ent.gz)
-open OUT, ">", "$out" or die "Can't create file $out: $!\n";
+open OUT, "$diamond", "$rcsb_list" or die "Can't create file $rcsb_list: $!\n";
 
 my $pdb_count = 0;
 my $start = time;
@@ -60,75 +81,81 @@
 
 		$pdb_count++;
 
-		open PDB, "<:gzip", "$pb" or die "Can't open file $pb: $!\n";
+		## Grabbing RCSB PDB entry name from pdb file
 		my ($pdb, $folder) = fileparse($pb);
 		$pdb =~ s/^pdb//;
 		$pdb =~ s/.ent.gz$//;
-		my $title = undef;
-		my %molecules;
-		my $mol_id = undef;
 
 		## verbosity; lots of files to parse...
 		my $modulo = ($pdb_count % $verbose);
 		my $current_count = commify($pdb_count);
 		if ($modulo == 0){ print "Working on PDB file # $current_count: $pb\n"; }
 
-		while (my $line = <PDB>){
-			chomp $line;
-			## Getting title info from TITLE entries
-			if ($line =~ /^TITLE\s{5}(.*)$/){
-				my $key = $1;
-				$key =~ s/\s+$/ /; ## Discard trailing space characters
-				$title .= $key;
-			}
-			elsif ($line =~ /^TITLE\s+\d+\s(.*)$/){
-				my $key = $1;
-				$key =~ s/\s+$/ /; ## Discard trailing space characters
-				$title .= $key;
-			}
-			## Getting chain information from COMPND entries
-			elsif ($line =~ /^COMPND\s+(\d+)?\s?MOL_ID:\s(\d+)/){
-				$mol_id = $2;
-			}
-			elsif ($line =~ /^COMPND\s+(\d+)?(.*)$/){
-				my $data = $2;
-				$data =~ s/\s+$//;
-				$molecules{$mol_id} .= $data;
-			}
-		}
-		binmode PDB, ":gzip(none)";
-
-		## Printing title
-		print OUT "$pdb\tTITLE\t$title\n";
-
-		## Printing chain(s)
-		foreach my $id (sort (keys %molecules)){
-
-			my $molecule;
-			my $chains;
-
-			if ($molecules{$id} =~ /MOLECULE: (.*?);/){
-				$molecule = $1;
-			}
-			elsif($molecules{$id} =~ /MOLECULE: (.*?)/){
-				$molecule = $1;
-				## If at end of COMPND section, no semicolon to after the molecule(s)
+		## Working on the PDB file only if it has not been seen before
+		if (exists $previous_data{$pdb}) { next; }
+		else {
+			open PDB, "<:gzip", "$pb" or die "Can't open file $pb: $!\n";
+			my $title = undef;
+			my %molecules;
+			my $mol_id = undef;
+
+			while (my $line = <PDB>){
+				chomp $line;
+				## Getting title info from TITLE entries
+				if ($line =~ /^TITLE\s{5}(.*)$/){
+					my $key = $1;
+					$key =~ s/\s+$/ /; ## Discard trailing space characters
+					$title .= $key;
+				}
+				elsif ($line =~ /^TITLE\s+\d+\s(.*)$/){
+					my $key = $1;
+					$key =~ s/\s+$/ /; ## Discard trailing space characters
+					$title .= $key;
+				}
+				## Getting chain information from COMPND entries
+				elsif ($line =~ /^COMPND\s+(\d+)?\s?MOL_ID:\s(\d+)/){
+					$mol_id = $2;
+				}
+				elsif ($line =~ /^COMPND\s+(\d+)?(.*)$/){
+					my $data = $2;
+					$data =~ s/\s+$//;
+					$molecules{$mol_id} .= $data;
+				}
 			}
-
-			if ($molecules{$id} =~ /CHAIN: (.*?);/){
-				$chains = $1;
-			}
-			elsif ($molecules{$id} =~ /CHAIN: (.*?)/){
-				$chains = $1;
-				## If at end of COMPND section, no semicolon to after the chain(s)
-			}
-
-			$chains =~ s/ //g;
-			my @chains = split (",", $chains);
-			foreach my $chain (@chains){
-				if ($molecule){	print OUT "$pdb\t$chain\t$molecule\n"; }
-				## Molecules might not be defined if engineered
-				else { print OUT "$pdb\t$chain\tundefined molecule\n"; }
+			binmode PDB, ":gzip(none)";
+
+			## Printing title
+			print OUT "$pdb\tTITLE\t$title\n";
+
+			## Printing chain(s)
+			foreach my $id (sort (keys %molecules)){
+
+				my $molecule;
+				my $chains;
+
+				if ($molecules{$id} =~ /MOLECULE: (.*?);/){
+					$molecule = $1;
+				}
+				elsif($molecules{$id} =~ /MOLECULE: (.*?)/){
+					$molecule = $1;
+					## If at the end of the COMPND section, there is no semicolon after the molecule(s)
+				}
+
+				if ($molecules{$id} =~ /CHAIN: (.*?);/){
+					$chains = $1;
+				}
+				elsif ($molecules{$id} =~ /CHAIN: (.*?)/){
+					$chains = $1;
+					## If at the end of the COMPND section, there is no semicolon after the chain(s)
+				}
+
+				$chains =~ s/ //g;
+				my @chains = split (",", $chains);
+				foreach my $chain (@chains){
+					if ($molecule){	print OUT "$pdb\t$chain\t$molecule\n"; }
+					## Molecules might not be defined if engineered
+					else { print OUT "$pdb\t$chain\tundefined molecule\n"; }
+				}
 			}
 		}
 	}

diff --git a/Homology_search/parse_all_models_by_Q.pl b/Homology_search/parse_all_models_by_Q.pl
@@ -4,7 +4,10 @@
 my $version = '0.1a';
 my $updated = '2021-09-13';
 
-use strict; use warnings; use Getopt::Long qw(GetOptions); use File::Basename;
+use strict;
+use warnings;
+use Getopt::Long qw(GetOptions);
+use File::Basename;
 
 my $usage = <<"USAGE";
 NAME		${name}
@@ -14,16 +17,16 @@
 		from best Q-score to worst.
 
 EXAMPLE		${name} \\
-		  -m *.GESAMT_per_model.matches \\
+		  -m *_GESAMT_per_model.matches \\
 		  -o All_GESAMT_matches_per_protein.tsv \\
 		  -x 50
 
 GENERAL OPTIONS:
 -m (--matches)	*.GESAMT.matches generated by descriptive_GESAMT_matches.pl
 -o (--out)	Output file in TSV format [Default: All_GESAMT_matches_per_protein.tsv]
 -x (--max)	Max number of distinct RCSB/chain hits to keep [Default: 50]
--r (--redun)	 Keep all entries for redundant RCSB chains [Default: off]
--w (--word)		Use word regular expression (\\w+) to capture locus tag [Default: off]
+-r (--redun)	Keep all entries for redundant RCSB chains [Default: off]
+-w (--word)	Use word regular expression (\\w+) to capture locus tag [Default: off]
 USAGE
 die "\n$usage\n" unless @ARGV;
 

diff --git a/Homology_search/update_PDB.pl b/Homology_search/update_PDB.pl
@@ -4,7 +4,9 @@
 my $name = 'update_PDB.pl';
 my $updated = '2021-05-17';
 
-use strict; use warnings; use Getopt::Long qw(GetOptions);
+use strict;
+use warnings;
+use Getopt::Long qw(GetOptions);
 
 ## Usage definition
 my $USAGE = <<"OPTIONS";

diff --git a/Misc_tools/rename_files.pl b/Misc_tools/rename_files.pl
@@ -4,7 +4,9 @@
 my $name = 'rename_files.pl';
 my $updated = '2021-03-12';
 
-use strict; use warnings; use Getopt::Long qw(GetOptions);
+use strict;
+use warnings;
+use Getopt::Long qw(GetOptions);
 
 ## Usage definition
 my $USAGE = <<"OPTIONS";

diff --git a/Misc_tools/split_Fasta.pl b/Misc_tools/split_Fasta.pl
@@ -4,7 +4,10 @@
 my $name = 'split_Fasta.pl';
 my $updated = '2021-09-07';
 
-use strict; use warnings; use PerlIO::gzip; use Getopt::Long qw(GetOptions);
+use strict;
+use warnings;
+use Getopt::Long qw(GetOptions);
+use PerlIO::gzip;
 
 ## Usage definition
 my $USAGE = <<"OPTIONS";

diff --git a/Misc_tools/split_PDB.pl b/Misc_tools/split_PDB.pl
@@ -4,8 +4,10 @@
 my $name = 'split_PDB.pl';
 my $updated = '2021-04-06';
 
-use strict; use warnings;
-use PerlIO::gzip; use File::Basename;
+use strict;
+use warnings;
+use PerlIO::gzip;
+use File::Basename;
 use Getopt::Long qw(GetOptions);
 
 ## Usage definition

diff --git a/Notes/Installation_notes.sh b/Notes/Installation_notes.sh
@@ -0,0 +1,46 @@
+## On Fedora 34
+##### Installing Aria2, Conda and Docker #####
+sudo dnf install aria2 conda docker
+
+## Starting Docker / enabling at boot
+sudo systemctl start docker
+sudo systemctl enable docker.service
+sudo systemctl enable containerd.service
+
+## Testing docker
+sudo docker run hello-world
+
+## Creating docker group + add user(s) to it + activate docker group changes
+sudo groupadd docker
+sudo usermod -aG docker $USER
+newgrp docker
+
+## Installing NVIDIA docker container - https://www.if-not-true-then-false.com/2020/install-nvidia-container-toolkit-on-fedora/
+sudo wget -O /etc/yum.repos.d/inttf.repo https://rpms.if-not-true-then-false.com/inttf.repo
+sudo dnf install nvidia-docker2
+
+## Modifying configuration file
+sudo nano /etc/nvidia-container-runtime/config.toml
+# Uncomment 'no-cgroups = false' and change it to 'no-cgroups = true'
+# Under [nvidia-container-runtime]:
+# Uncomment (enable): debug = "/var/log/nvidia-container-runtime.log"
+
+## Restarting Docker and checking that nvidia configuration is working
+systemctl restart docker
+nvidia-container-cli info
+docker run \
+	--privileged \
+	--gpus all \
+	--rm nvidia/cuda:11.1-base \
+	nvidia-smi
+
+## Make sure to add --privileged to prevent 'Failed to initialize NVML: Unknown Error'
+## Make sure the user(s) running docker have been added to the docker group (see above)
+
+## Testing capabilities with sample
+docker run \
+	--privileged \
+	--gpus all \
+	--rm nvcr.io/nvidia/k8s/cuda-sample:nbody nbody \
+	-benchmark \
+	-numbodies=512000