From f102e9ab566aca3fdc79f2e9055aef87aeb4e005 Mon Sep 17 00:00:00 2001
From: soumyakundu
Date: Tue, 23 Jan 2024 18:23:20 +0000
Subject: [PATCH] Saving progress on MAF encoding of variants

---
 chrombpnet.egg-info/PKG-INFO                  |  9 ++
 chrombpnet.egg-info/SOURCES.txt               | 91 +++++++++++++++++++
 chrombpnet.egg-info/dependency_links.txt      |  1 +
 chrombpnet.egg-info/entry_points.txt          |  3 +
 chrombpnet.egg-info/not-zip-safe              |  1 +
 chrombpnet.egg-info/requires.txt              | 19 ++++
 chrombpnet.egg-info/top_level.txt             |  1 +
 .../data_generators/batchgen_generator.py     |  8 +-
 .../training/data_generators/initializers.py  |  6 +-
 chrombpnet/training/train.py                  |  4 +-
 chrombpnet/training/utils/argmanager.py       |  2 +-
 chrombpnet/training/utils/data_utils.py       | 27 +++++-
 chrombpnet/training/utils/one_hot.py          | 49 ++++++++++
 13 files changed, 210 insertions(+), 11 deletions(-)
 create mode 100644 chrombpnet.egg-info/PKG-INFO
 create mode 100644 chrombpnet.egg-info/SOURCES.txt
 create mode 100644 chrombpnet.egg-info/dependency_links.txt
 create mode 100644 chrombpnet.egg-info/entry_points.txt
 create mode 100644 chrombpnet.egg-info/not-zip-safe
 create mode 100644 chrombpnet.egg-info/requires.txt
 create mode 100644 chrombpnet.egg-info/top_level.txt

diff --git a/chrombpnet.egg-info/PKG-INFO b/chrombpnet.egg-info/PKG-INFO
new file mode 100644
index 00000000..9f6cc55d
--- /dev/null
+++ b/chrombpnet.egg-info/PKG-INFO
@@ -0,0 +1,9 @@
+Metadata-Version: 2.1
+Name: chrombpnet
+Version: 0.1.5
+Summary: chrombpnet predicts chromatin accessibility from sequence
+Download-URL: https://github.com/kundajelab/chrombpnet
+Author-email: anusri @ stanford.edu
+License: MIT
+Requires-Python: >=3.8
+License-File: LICENSE
diff --git a/chrombpnet.egg-info/SOURCES.txt b/chrombpnet.egg-info/SOURCES.txt
new file mode 100644
index 00000000..427da5dd
--- /dev/null
+++ b/chrombpnet.egg-info/SOURCES.txt
@@ -0,0 +1,91 @@
+LICENSE
+MANIFEST.in
+README.md
+requirements.txt
+setup.py
+chrombpnet/CHROMBPNET.py
+chrombpnet/__init__.py
+chrombpnet/parsers.py
+chrombpnet/pipelines.py
+chrombpnet.egg-info/PKG-INFO
+chrombpnet.egg-info/SOURCES.txt
+chrombpnet.egg-info/dependency_links.txt
+chrombpnet.egg-info/entry_points.txt
+chrombpnet.egg-info/not-zip-safe
+chrombpnet.egg-info/requires.txt
+chrombpnet.egg-info/top_level.txt
+chrombpnet/data/ATAC.ref.motifs.txt
+chrombpnet/data/DNASE.ref.motifs.txt
+chrombpnet/data/__init__.py
+chrombpnet/data/motif_to_pwm.ATAC.tsv
+chrombpnet/data/motif_to_pwm.DNASE.tsv
+chrombpnet/data/motif_to_pwm.TF.tsv
+chrombpnet/data/motifs.meme.txt
+chrombpnet/evaluation/__init__.py
+chrombpnet/evaluation/interpret/__init__.py
+chrombpnet/evaluation/interpret/input_utils.py
+chrombpnet/evaluation/interpret/interpret.py
+chrombpnet/evaluation/interpret/shap_utils.py
+chrombpnet/evaluation/invivo_footprints/__init__.py
+chrombpnet/evaluation/invivo_footprints/run_tfmodisco.py
+chrombpnet/evaluation/invivo_footprints/tf_modiscohits.py
+chrombpnet/evaluation/make_bigwigs/__init__.py
+chrombpnet/evaluation/make_bigwigs/bigwig_helper.py
+chrombpnet/evaluation/make_bigwigs/importance_hdf5_to_bigwig.py
+chrombpnet/evaluation/make_bigwigs/predict_to_bigwig.py
+chrombpnet/evaluation/marginal_footprints/__init__.py
+chrombpnet/evaluation/marginal_footprints/marginal_footprinting.py
+chrombpnet/evaluation/modisco/__init__.py
+chrombpnet/evaluation/modisco/convert_html_to_pdf.py
+chrombpnet/evaluation/modisco/fetch_tomtom.py
+chrombpnet/evaluation/modisco/modisco.sh
+chrombpnet/evaluation/modisco/run_modisco.py
+chrombpnet/evaluation/modisco/visualize_motif_matches.py
+chrombpnet/evaluation/variant_effect_prediction/__init__.py
+chrombpnet/evaluation/variant_effect_prediction/snp_generator.py
+chrombpnet/evaluation/variant_effect_prediction/snp_scoring.py
+chrombpnet/evaluation/variant_effect_prediction/testing.py
+chrombpnet/helpers/__init__.py
+chrombpnet/helpers/generate_reports/__init__.py
+chrombpnet/helpers/generate_reports/make_html.py
+chrombpnet/helpers/generate_reports/make_html_bias.py
+chrombpnet/helpers/hyperparameters/__init__.py
+chrombpnet/helpers/hyperparameters/find_bias_hyperparams.py
+chrombpnet/helpers/hyperparameters/find_chrombpnet_hyperparams.py
+chrombpnet/helpers/hyperparameters/param_utils.py
+chrombpnet/helpers/make_chr_splits/__init__.py
+chrombpnet/helpers/make_chr_splits/splits.py
+chrombpnet/helpers/make_gc_matched_negatives/__init__.py
+chrombpnet/helpers/make_gc_matched_negatives/get_gc_content.py
+chrombpnet/helpers/make_gc_matched_negatives/get_gc_matched_negatives.py
+chrombpnet/helpers/make_gc_matched_negatives/make_gc_matched_negatives.sh
+chrombpnet/helpers/make_gc_matched_negatives/get_genomewide_gc_buckets/__init__.py
+chrombpnet/helpers/make_gc_matched_negatives/get_genomewide_gc_buckets/get_genomewide_gc_bins.py
+chrombpnet/helpers/preprocessing/__init__.py
+chrombpnet/helpers/preprocessing/auto_shift_detect.py
+chrombpnet/helpers/preprocessing/reads_to_bigwig.py
+chrombpnet/helpers/preprocessing/analysis/__init__.py
+chrombpnet/helpers/preprocessing/analysis/build_pwm_from_bigwig.py
+chrombpnet/training/__init__.py
+chrombpnet/training/metrics.py
+chrombpnet/training/predict.py
+chrombpnet/training/train.py
+chrombpnet/training/data_generators/__init__.py
+chrombpnet/training/data_generators/batchgen_generator.py
+chrombpnet/training/data_generators/initializers.py
+chrombpnet/training/models/__init__.py
+chrombpnet/training/models/bpnet_model.py
+chrombpnet/training/models/chrombpnet_with_bias_model.py
+chrombpnet/training/utils/__init__.py
+chrombpnet/training/utils/argmanager.py
+chrombpnet/training/utils/augment.py
+chrombpnet/training/utils/callbacks.py
+chrombpnet/training/utils/data_utils.py
+chrombpnet/training/utils/losses.py
+chrombpnet/training/utils/metrics_utils.py
+chrombpnet/training/utils/one_hot.py
+tests/full_workflow.sh
+tests/genomewide_gc_bin_test.sh
+tests/test_pred_to_bigwig.sh
+workflows/train_bias_model.sh
+workflows/train_chrombpnet_model.sh
\ No newline at end of file
diff --git a/chrombpnet.egg-info/dependency_links.txt b/chrombpnet.egg-info/dependency_links.txt
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/chrombpnet.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/chrombpnet.egg-info/entry_points.txt b/chrombpnet.egg-info/entry_points.txt
new file mode 100644
index 00000000..26ec6379
--- /dev/null
+++ b/chrombpnet.egg-info/entry_points.txt
@@ -0,0 +1,3 @@
+[console_scripts]
+chrombpnet = chrombpnet.CHROMBPNET:main
+print_meme_motif_file = chrombpnet.data.__init__:print_meme_motif_file
diff --git a/chrombpnet.egg-info/not-zip-safe b/chrombpnet.egg-info/not-zip-safe
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/chrombpnet.egg-info/not-zip-safe
@@ -0,0 +1 @@
+
diff --git a/chrombpnet.egg-info/requires.txt b/chrombpnet.egg-info/requires.txt
new file mode 100644
index 00000000..2656630a
--- /dev/null
+++ b/chrombpnet.egg-info/requires.txt
@@ -0,0 +1,19 @@
+h5py>=2.10.0
+matplotlib>=3.3.1
+matplotlib-venn==0.11.6
+numpy==1.23.4
+pandas>=1.3.4
+pyfaidx==0.6.1
+scikit-learn>=1.1.2
+scipy>=1.4.1
+tensorflow==2.8.0
+tensorflow-estimator==2.8.0
+tensorflow-probability==0.15.0
+protobuf==3.20
+tqdm==4.48.2
+deepdish==0.3.7
+deeplift==0.6.13.0
+modisco==0.5.16.0
+modisco-lite==2.0.7
+weasyprint==52.5
+kundajelab-shap==1
diff --git a/chrombpnet.egg-info/top_level.txt b/chrombpnet.egg-info/top_level.txt
new file mode 100644
index 00000000..a55b8bb4
--- /dev/null
+++ b/chrombpnet.egg-info/top_level.txt
@@ -0,0 +1 @@
+chrombpnet
diff --git a/chrombpnet/training/data_generators/batchgen_generator.py b/chrombpnet/training/data_generators/batchgen_generator.py
index f5111079..edf2d6da 100644
--- a/chrombpnet/training/data_generators/batchgen_generator.py
+++ b/chrombpnet/training/data_generators/batchgen_generator.py
@@ -8,6 +8,7 @@
 import math
 import os
 import json
+import pandas as pd
 
 def subsample_nonpeak_data(nonpeak_seqs, nonpeak_cts, nonpeak_coords, peak_data_size, negative_sampling_ratio):
     #Randomly samples a portion of the non-peak data to use in training
@@ -24,7 +25,9 @@ class ChromBPNetBatchGenerator(keras.utils.Sequence):
     every epoch, and calls bias model on it, whose outputs (bias profile logits
     and bias logcounts) are fed as input to the chrombpnet model.
     """
-    def __init__(self, peak_regions, nonpeak_regions, genome_fasta, batch_size, inputlen, outputlen, max_jitter, negative_sampling_ratio, cts_bw_file, add_revcomp, return_coords, shuffle_at_epoch_start):
+    def __init__(self, peak_regions, nonpeak_regions, genome_fasta,
+                 batch_size, inputlen, outputlen, max_jitter, negative_sampling_ratio,
+                 cts_bw_file, add_revcomp, return_coords, shuffle_at_epoch_start, maf):
         """
         seqs: B x L' x 4
         cts: B x M'
@@ -33,7 +36,8 @@ def __init__(self, peak_regions, nonpeak_regions, genome_fasta, batch_size, inpu
         batch_size: int (B)
         """
 
-        peak_seqs, peak_cts, peak_coords, nonpeak_seqs, nonpeak_cts, nonpeak_coords, = data_utils.load_data(peak_regions, nonpeak_regions, genome_fasta, cts_bw_file, inputlen, outputlen, max_jitter)
+        peak_seqs, peak_cts, peak_coords, nonpeak_seqs, nonpeak_cts, nonpeak_coords, = data_utils.load_data(peak_regions, nonpeak_regions, genome_fasta,
+                                                                                                            cts_bw_file, inputlen, outputlen, max_jitter, maf=maf)
         self.peak_seqs, self.nonpeak_seqs = peak_seqs, nonpeak_seqs
         self.peak_cts, self.nonpeak_cts = peak_cts, nonpeak_cts
         self.peak_coords, self.nonpeak_coords = peak_coords, nonpeak_coords
diff --git a/chrombpnet/training/data_generators/initializers.py b/chrombpnet/training/data_generators/initializers.py
index f074fd30..bfa890b8 100644
--- a/chrombpnet/training/data_generators/initializers.py
+++ b/chrombpnet/training/data_generators/initializers.py
@@ -2,6 +2,7 @@
 from chrombpnet.training.utils import data_utils
 import pandas as pd
 import json
+import tabix
 
 NARROWPEAK_SCHEMA = ["chr", "start", "end", "1", "2", "3", "4", "5", "6", "summit"]
 
@@ -74,6 +75,8 @@ def initialize_generators(args, mode, parameters, return_coords):
         nonpeak_regions=pd.read_csv(args.nonpeaks,header=None,sep='\t',names=NARROWPEAK_SCHEMA)
         nonpeak_regions, chroms=get_bed_regions_for_fold_split(nonpeak_regions, mode, splits_dict)
 
+    maf_file = '/users/soumyak/maf_encoding/eur.hg38.frq.formatted.tsv.gz'  # TODO: hardcoded WIP path; expose as a CLI argument
+
     inputlen, outputlen, \
     nonpeak_regions, negative_sampling_ratio, \
     max_jitter, add_revcomp, shuffle_at_epoch_start = fetch_data_and_model_params_based_on_mode(mode, args, parameters, nonpeak_regions, peak_regions)
@@ -89,7 +92,8 @@
             cts_bw_file=args.bigwig,
             add_revcomp=add_revcomp,
             return_coords=return_coords,
-            shuffle_at_epoch_start=shuffle_at_epoch_start
+            shuffle_at_epoch_start=shuffle_at_epoch_start,
+            maf=maf_file
         )
     return generator
diff --git a/chrombpnet/training/train.py b/chrombpnet/training/train.py
index a87bc59f..c2a04d65 100644
--- a/chrombpnet/training/train.py
+++ b/chrombpnet/training/train.py
@@ -33,8 +33,8 @@ def fit_and_evaluate(model,train_gen,valid_gen,args,architecture_module):
     earlystopper = tfcallbacks.EarlyStopping(monitor='val_loss', mode="min", patience=args.early_stop, verbose=1, restore_best_weights=True)
     history= callbacks.LossHistory(model_output_path_logs_name+".batch",args.trackables)
     csvlogger = tfcallbacks.CSVLogger(model_output_path_logs_name, append=False)
-    #reduce_lr = tfcallbacks.ReduceLROnPlateau(monitor='val_loss',factor=0.4, patience=args.early_stop-2, min_lr=0.00000001)
-    cur_callbacks=[checkpointer,earlystopper,csvlogger,history]
+    reduce_lr = tfcallbacks.ReduceLROnPlateau(monitor='val_loss',factor=0.4, patience=4, min_lr=0.00000001)
+    cur_callbacks=[checkpointer,earlystopper,csvlogger,history,reduce_lr]
 
     model.fit(train_gen,
               validation_data=valid_gen,
diff --git a/chrombpnet/training/utils/argmanager.py b/chrombpnet/training/utils/argmanager.py
index 5445e826..02f636a9 100644
--- a/chrombpnet/training/utils/argmanager.py
+++ b/chrombpnet/training/utils/argmanager.py
@@ -11,7 +11,7 @@ def update_data_args(parser):
 
 def update_train_args(parser):
     parser.add_argument("-e", "--epochs", type=int, default=50, help="Maximum epochs to train")
-    parser.add_argument("-es", "--early-stop", type=int, default=5, help="Early stop limit, corresponds to 'patience' in callback")
+    parser.add_argument("-es", "--early-stop", type=int, default=10, help="Early stop limit, corresponds to 'patience' in callback")
     parser.add_argument("-bs", "--batch_size", type=int, default=64)
     parser.add_argument("-l", "--learning-rate", type=float, default=0.001)
    parser.add_argument("-pf", "--params", type=str, required=True, default=None)
diff --git a/chrombpnet/training/utils/data_utils.py b/chrombpnet/training/utils/data_utils.py
index 261fd64e..c389c3bd 100644
--- a/chrombpnet/training/utils/data_utils.py
+++ b/chrombpnet/training/utils/data_utils.py
@@ -17,6 +17,21 @@ def get_seq(peaks_df, genome, width):
     return one_hot.dna_to_one_hot(vals)
 
+def get_seq_maf(peaks_df, genome, width, maf):
+    """
+    Same as get_seq, but re-encodes positions overlapping known variants with allele-frequency weights from a tabix-indexed MAF file.
+ """ + vals = [] + peak_coords = [] + + for i, r in peaks_df.iterrows(): + # Fetch sequence from genome + sequence = str(genome[r['chr']][(r['start'] + r['summit'] - width // 2):(r['start'] + r['summit'] + width // 2)]) + vals.append(sequence) + peak_coords.append((r['chr'], r['start'] + r['summit'] - width // 2, r['start'] + r['summit'] + width // 2)) + + # Apply one-hot encoding considering SNPs + return one_hot.dna_to_one_hot_maf(vals, peak_coords, maf) def get_cts(peaks_df, bw, width): """ @@ -45,14 +60,14 @@ def get_coords(peaks_df, peaks_bool): return np.array(vals) -def get_seq_cts_coords(peaks_df, genome, bw, input_width, output_width, peaks_bool): +def get_seq_cts_coords(peaks_df, genome, bw, input_width, output_width, peaks_bool, maf): - seq = get_seq(peaks_df, genome, input_width) + seq = get_seq_maf(peaks_df, genome, input_width, maf=maf) cts = get_cts(peaks_df, bw, output_width) coords = get_coords(peaks_df, peaks_bool) return seq, cts, coords -def load_data(bed_regions, nonpeak_regions, genome_fasta, cts_bw_file, inputlen, outputlen, max_jitter): +def load_data(bed_regions, nonpeak_regions, genome_fasta, cts_bw_file, inputlen, outputlen, max_jitter, maf): """ Load sequences and corresponding base resolution counts for training, validation regions in peaks and nonpeaks (2 x 2 x 2 = 8 matrices). @@ -81,7 +96,8 @@ def load_data(bed_regions, nonpeak_regions, genome_fasta, cts_bw_file, inputlen, cts_bw, inputlen+2*max_jitter, outputlen+2*max_jitter, - peaks_bool=1) + peaks_bool=1, + maf=maf) if nonpeak_regions is not None: train_nonpeaks_seqs, train_nonpeaks_cts, train_nonpeaks_coords = get_seq_cts_coords(nonpeak_regions, @@ -89,7 +105,8 @@ def load_data(bed_regions, nonpeak_regions, genome_fasta, cts_bw_file, inputlen, cts_bw, inputlen, outputlen, - peaks_bool=0) + peaks_bool=0, + maf=maf) diff --git a/chrombpnet/training/utils/one_hot.py b/chrombpnet/training/utils/one_hot.py index 4fadf92a..4840a54b 100644 --- a/chrombpnet/training/utils/one_hot.py +++ b/chrombpnet/training/utils/one_hot.py @@ -5,6 +5,7 @@ """ import numpy as np +import tabix def dna_to_one_hot(seqs): """ @@ -36,6 +37,54 @@ def dna_to_one_hot(seqs): # Get the one-hot encoding for those indices, and reshape back to separate return one_hot_map[base_inds[:-4]].reshape((len(seqs), seq_len, 4)) +def dna_to_one_hot_maf(seqs, peak_coords, maf_file): + """ + Converts a list of DNA ("ACGT") sequences to one-hot encodings, where the + position of 1s is ordered alphabetically by "ACGT". `seqs` must be a list + of N strings, where every string is the same length L. Returns an N x L x 4 + NumPy array of one-hot encodings, in the same order as the input sequences. + All bases will be converted to upper-case prior to performing the encoding. + Any bases that are not "ACGT" will be given an encoding of all 0s. 
+ """ + seq_len = len(seqs[0]) + assert np.all(np.array([len(s) for s in seqs]) == seq_len) + + # Join all sequences together into one long string, all uppercase + seq_concat = "".join(seqs).upper() + "ACGT" + # Add one example of each base, so np.unique doesn't miss indices later + + one_hot_map = np.identity(5)[:, :-1].astype(np.int8) + + # Convert string into array of ASCII character codes; + base_vals = np.frombuffer(bytearray(seq_concat, "utf8"), dtype=np.int8) + + # Anything that's not an A, C, G, or T gets assigned a higher code + base_vals[~np.isin(base_vals, np.array([65, 67, 71, 84]))] = 85 + + # Convert the codes into indices in [0, 4], in ascending order by code + _, base_inds = np.unique(base_vals, return_inverse=True) + + # Get the one-hot encoding for those indices, and reshape back to separate + one_hot_encoding = one_hot_map[base_inds[:-4]].reshape((len(seqs), seq_len, 4)) + + maf = tabix.open(maf_file) + + # Update one-hot encoding based on SNP information + for seq_index, (seq_chrom, seq_start, seq_end) in enumerate(peak_coords): + # print(seq_index, seq_chrom, seq_start, seq_end) + if seq_chrom in ['chr' + str(x) for x in range(1,23)]: + matches = maf.query(seq_chrom, seq_start, seq_end) + if matches: + # print(matches) + for match in matches: + ref_allele_idx = "ACGT".index(match[2]) + minor_allele_idx = "ACGT".index(match[3]) + pos_index = int(match[1]) - 1 - seq_start + one_hot_encoding[seq_index, pos_index, :] = 0.0 + one_hot_encoding[seq_index, pos_index, ref_allele_idx] = 1.0 - float(match[4]) + one_hot_encoding[seq_index, pos_index, minor_allele_idx] = float(match[4]) + + return one_hot_encoding def one_hot_to_dna(one_hot): """