Merge pull request #177 from woook/master

upgrade integrity check
moka-guys · Jan 24, 2018 · 95ea3f2 · 95ea3f2
2 parents 82f5907 + acd2a23
commit 95ea3f2
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 57 deletions.
diff --git a/automate_demultiplex_config.py b/automate_demultiplex_config.py
@@ -9,7 +9,8 @@
 debug = False
 
 # =====git release for the automate_demultiplexing repo=====
-script_release = "v8.0"
+script_release = "v9.0"
+
 
 # =====location of input/output files=====
 # path to run folders

diff --git a/sequencer_checksum.py b/sequencer_checksum.py
@@ -8,18 +8,10 @@
 
 class Nextseq_Integrity_Check():
 	def __init__(self):
-		# path to the temp runfolder to be populated depending on sequencer
-		self.temp_folder = ""
 
-		# use the existence of expected folders to determine the sequencer
 		# temp folder on the nextseq
 		self.nextseqtemp_folder = "D://Illumina//NextSeq Control Software Temp"
-		# temp folder on the miseq
-		self.miseqtemp_folder = "D://Illumina//MiSeqTemp"
-		# flag to be used if it's the nextseq
-		self.nextseq=""
-
-
+
 		# path to the mapped workstation share
 		self.mapped_workstation_folder = "Z://"
 
@@ -42,17 +34,17 @@ def __init__(self):
 		self.workstation_runfolder = ""
 		self.sequencer_runfolder = ""
 
+		# files to ignore from checksum
+		self.exclude = ["RTAStart.bat", "CorrectedIntMetrics.bin", "EmpiricalPhasingMetrics.bin", "ErrorMetrics.bin", "EventMetrics.bin", "ExtractionMetrics.bin", "PFGridMetrics.bin", "QMetrics.bin", "RegistrationMetrics.bin", "TileMetrics.bin", "000_000_000_na_rtabat.trans", "FilesAdded.csv", "FilesCopied.csv", "md5checksum.txt"]
 		# flag to set testing		
-		self.testing=False
+		self.testing = False
 
 		# if testing overwrite the paths to that of the testing 
 		if self.testing:
 			# drive letter given to usb stick
 			self.mapped_drive = "E://"
 			# path to the fake nextseqtemp folder
 			self.nextseqtemp_folder = self.mapped_drive + "integrity_testing//sequencer_temp"
-			# path to the fake miseqtemp folder
-			self.miseqtemp_folder = self.mapped_drive + "integrity_testing//sequencer_temp"
 			# path to the fake workstation folder
 			self.mapped_workstation_folder = self.mapped_drive + "integrity_testing//workstation"
 			# path to the fake checksums_inprogress folder
@@ -64,28 +56,12 @@ def look_for_folder(self):
 		"""
 		This script runs every hour.
 		The script needs to detect when a run has started, and display a window which remains until the integrity test has been performed.
-		This function identifies which sequencer the script is running on and then looks to see if a run has finished which is not already being monitored.
-		If it's a newly started run, display a window to say not to do anything until sequencing is complete and integrity checks done.
+		Display a window to say not to do anything until sequencing is complete and integrity checks done.
 		When checksums are done, display a message box displaying pass/fail messages.
 		"""
-		# check what sequencer it is.
-		# look if the nextseq temp folder exists
-		if os.path.isdir(self.nextseqtemp_folder):
-			# set the temp folder to the nextseq temp folder path
-			self.temp_folder = self.nextseqtemp_folder
-			# set flag to denote nextseq
-			self.nextseq = True
-		# if the nextseq folder doesn't exist the miseq temp folder should
-		elif os.path.isdir(self.miseqtemp_folder):
-			# set the temp folder to the miseq temp folder path
-			self.temp_folder = self.miseqtemp_folder
-		else:
-			# if neither path exists raise an error
-			raise UserWarning("Can't determine what sequencer this is!")
-
 
 		# for each runfolder in temp folder
-		for temp_runfolder in os.listdir(self.temp_folder):
+		for temp_runfolder in os.listdir(self.nextseqtemp_folder):
 			# look to see if the run has already been caught by this script at a previous time
 			if temp_runfolder in os.listdir(self.run_in_progress):
 				# skip this temp_runfolder
@@ -104,15 +80,23 @@ def look_for_folder(self):
 					new_run_marker.write(str(datetime.datetime.now()))
 
 				# call function which opens a window to say run in progress - don't do anything until a message box appears denoting integrity check has been performed
-				# this function will close when the run ends and (if nextseq) the checksum has been calculated
+				# this function will close when the run ends and the checksum has been calculated
 				self.open_window()
 
 				# call function to assess result of checksum and display message box
 					# if checksums match (integrity test pass) return an info box
 				if self.check_checksums():
+					# create root window which can then be hidden
+					root = Tkinter.Tk()
+					# hide 
+					root.withdraw()
 					tkMessageBox.showinfo("Integrity check complete","Integrity check passed")
 					# if checksums don't match (integrity test FAIL) return an error box
 				else:
+					# create root window which can then be hidden
+					root = Tkinter.Tk()
+					# hide 
+					root.withdraw()
 					tkMessageBox.showerror("Integrity check complete","Integrity check failed - please do not use this sequencer and inform the Bioinformatics team immediately")
 
 
@@ -152,7 +136,7 @@ def run_has_finished(self):
 		If required the checksums are generated, or if not the script waits until the checksums have been generated (by the demultiplexing script).
 		"""
 		# build path to the runfolder
-		self.sequencer_runfolder = os.path.join(self.temp_folder, self.runfolder)
+		self.sequencer_runfolder = os.path.join(self.nextseqtemp_folder, self.runfolder)
 		# build paths on the workstation
 		self.workstation_runfolder = os.path.join(self.mapped_workstation_folder, self.runfolder)
 		#flag to denote run and data transfer has finished
@@ -163,25 +147,13 @@ def run_has_finished(self):
 			if self.RTA_complete in os.listdir(self.sequencer_runfolder) and self.RTA_complete in os.listdir(self.workstation_runfolder):
 					# if it's a testing run print a message
 					if self.testing:
-						print "run finished"
-					# now run has finished and is transfered can calculate the checksum (if required) - do this here so window remains until checksum has finished.
-					if self.nextseq:
-						# if it's a testing run print a message
-						if self.testing:
-							print "NextSeq"
-						# call function which triggers the checksum calculations
-						self.prepare_checksum_calculations()
+						print "run finished - skipping 2 hour wait"
 					else:
-						# if it's a miseq run wait to see if checksum has been done
-						while not self.checksums_done():
-							# if it's a testing run don't wait as long and print a message
-							if self.testing:
-								print "miseq run, waiting for checksums"
-								time.sleep(20)
-							else:
-								# wait 5 minutes
-								time.sleep(300)
-
+						# sleep 2 hours to ensure all file transfers are done
+						time.sleep(7200)
+
+					# call function which triggers the checksum calculations
+					self.prepare_checksum_calculations()
 					# now all checksums are done change flag to true so the loop finishes and the window is closed
 					finished = True
 
@@ -201,7 +173,7 @@ def run_has_finished(self):
 
 	def prepare_checksum_calculations(self):
 		"""
-		On the nextseq the checksums are calculated by this script.
+		The checksums are calculated by this script.
 		This function checks the runfolder has not already been checksummed, marks the folder as being checksummed and then calls the function to generate the checksums.
 		"""
 		if self.testing:
@@ -227,21 +199,24 @@ def prepare_checksum_calculations(self):
 	def run_integrity_check(self):
 		"""
 		This function uses self variables for the filepaths and calculates the checksums on directories.
+		It looks for the presence of any files which should be ignored as they are not copied from temp to output.
 		The checksums are written to a file on the workstation for the demultiplexing script.
 		"""
 		if self.testing:
 			print "starting integrity checking"
-		# calculate md5 checksums using dirhash package for both folders
-		workstation_checksum = dirhash(self.workstation_runfolder, 'md5')
-		sequencer_checksum = dirhash(self.sequencer_runfolder, 'md5')
+
+		# calculate the checksums, excluding the files listed in self.exclude
+		workstation_checksum = dirhash(self.workstation_runfolder, 'md5',excluded_files=self.exclude)
+		sequencer_checksum = dirhash(self.sequencer_runfolder, 'md5',excluded_files=self.exclude)
+
 		if self.testing:
 			print "workstation checksum = " + workstation_checksum
 			print "sequencer checksum = " + sequencer_checksum	   
 
 		# write the checksums to the output file (on workstation)
 		with open(os.path.join(self.workstation_runfolder, self.output_file), 'w') as outputfile:
-			outputfile.write("workstation checksum (" + self.workstation_runfolder + ")=" + workstation_checksum+"\n")
-			outputfile.write("sequencer checksum (" + self.sequencer_runfolder + ")=" + sequencer_checksum)
+			outputfile.write("workstation checksum (" + self.workstation_runfolder + ")=" + workstation_checksum + "\n")
+			outputfile.write("sequencer checksum (" + self.sequencer_runfolder + ")=" + sequencer_checksum + "\n")
 
 
 	def checksums_done(self):