-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_profiles.sh
144 lines (127 loc) · 4.86 KB
/
generate_profiles.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/bin/bash
#
# Type 2 Diabetes Adipocyte Morphology
# Gregory Way, 2019
#
# Generate cell painting profiles for downstream analyses
############################
# Step 1 - Configure the Environment
############################
# Step 1.1: Setup a virtual machine on AWS
#
# * Launch an ec2 instance on AWS
# * AMI: cytomining/images/hvm-ssd/cytominer-ubuntu-trusty-14.04-amd64-server-1529668435
# * Instance Type: m4.xlarge
# * Network: vpc-35149752 - Subnet: Default (imaging platform terraform)
# * IAM role: `s3-imaging-platform-role`
# * Add New Volume (if necessary): `EBS` with 110 GiB
# * No Tags
# * Select Existing Security Group: `SSH_HTTP`
# * Review and Launch
# * ssh -i <USER>.pem ubuntu@<Public DNS IPv4>
# * Inside AWS terminal: `aws configure` and input security credentials
#
# See https://cytomining.github.io/profiling-handbook/configure-environment.html#set-up-a-virtual-machine
# for more details
# Step 1.2: Define Variables
# Project, batch and S3 bucket identifiers used by the steps below
PROJECT_NAME="2018_04_12_T2D_V2F_Saadat_Broad"
BATCH_ID="2019_04_16_Batch1"
BUCKET="imaging-platform"
# m4.xlarge has 4 cores; keep 1 free
MAXPROCS=3

# Create the project workspace, move into it, then add the per-batch log folder
mkdir -p ~/"efs/${PROJECT_NAME}/workspace/"
cd ~/"efs/${PROJECT_NAME}/workspace/"
mkdir -p "log/${BATCH_ID}"

# Resolved path of the plate list that the `parallel` calls below read from
PLATES=$(readlink -f ~/"efs/${PROJECT_NAME}/workspace/scratch/${BATCH_ID}/plates_to_process.txt")
# Step 1.3 - Create an EBS temp directory for creating the backend
# `-p` keeps this step re-runnable when the directory already exists
mkdir -p ~/ebs_tmp
############################
# Step 2 - Configure Tools to Process Images
############################
cd ~/"efs/${PROJECT_NAME}/workspace/"
# `-p` so that a second run does not fail on the existing folder
mkdir -p software
cd software
# Clone cytominer_scripts over SSH; skipped when a checkout is already present,
# because `git clone` refuses to write into an existing non-empty directory
if [ ! -d cytominer_scripts/.git ]; then
  git clone git@github.com:broadinstitute/cytominer_scripts.git
fi
############################
# Step 3 - Annotate
############################
# NOTE - The annotate step creates `augmented` profiles in the `backend` folder
# `augmented` profiles represent aggregated per-well data annotated with metadata
# Retrieve metadata information
aws s3 sync \
  "s3://${BUCKET}/projects/${PROJECT_NAME}/workspace/metadata/${BATCH_ID}/" \
  ~/"efs/${PROJECT_NAME}/workspace/metadata/${BATCH_ID}/"
# Use cytominer_scripts to run annotation
# The relative ../../ paths below are resolved from this directory
cd ~/"efs/${PROJECT_NAME}/workspace/software/cytominer_scripts"
# Run annotate.R once per entry read from ${PLATES} ({1} is replaced by the entry).
# --max-procs caps the number of simultaneous jobs at MAXPROCS; without it
# parallel starts one job per CPU core and the "keep 1 free" intent is lost.
parallel \
  --no-run-if-empty \
  --eta \
  --max-procs "${MAXPROCS}" \
  --joblog "../../log/${BATCH_ID}/annotate.log" \
  --results "../../log/${BATCH_ID}/annotate" \
  --files \
  --keep-order \
  ./annotate.R \
  --batch_id "${BATCH_ID}" \
  --plate_id {1} :::: "${PLATES}"
############################
# Step 4 - Normalize
############################
# Note - The normalize step creates `normalized` profiles in the `backend` folder
# The step z-scores each feature using all wells (i.e. use all "non-dummy" wells)
# Run normalize.R once per entry read from ${PLATES} ({1} is replaced by the entry).
# --max-procs caps the number of simultaneous jobs at MAXPROCS; without it
# parallel starts one job per CPU core and the "keep 1 free" intent is lost.
# NOTE: the backslash-escaped quotes in --subset are handed to parallel
# literally; keep that argument exactly as written.
parallel \
  --no-run-if-empty \
  --eta \
  --max-procs "${MAXPROCS}" \
  --joblog "../../log/${BATCH_ID}/normalize.log" \
  --results "../../log/${BATCH_ID}/normalize" \
  --files \
  --keep-order \
  ./normalize.R \
  --batch_id "${BATCH_ID}" \
  --plate_id {1} \
  --subset \"Metadata_Well != \'\'\'dummy\'\'\'\" :::: "${PLATES}"
############################
# Step 5 - Variable Selection
############################
# Note - Variable selection uses both normalized and unnormalized data
# Folder that receives the sampled feather files
mkdir -p "../../parameters/${BATCH_ID}/sample/"
# Step 5.0 - Sample normalized and unnormalized data
# `normalized` profiles are sampled first, then `augmented` (unnormalized) ones
for profile_kind in normalized augmented; do
  ./sample.R \
    --batch_id "${BATCH_ID}" \
    --pattern "_${profile_kind}.csv" \
    --replicates 2 \
    --output "../../parameters/${BATCH_ID}/sample/${BATCH_ID}_${profile_kind}_sample.feather"
done
# Using the sampled feather files, perform a series of three variable selection steps
# Run preselect.R on one sampled feather file with a single operation.
#   $1 - profile kind embedded in the sample file name (normalized | augmented)
#   $2 - value handed to --operations
run_preselect() {
  ./preselect.R \
    --batch_id "${BATCH_ID}" \
    --input "../../parameters/${BATCH_ID}/sample/${BATCH_ID}_${1}_sample.feather" \
    --operations "${2}"
}
# Step 5.1 - Remove variables that have high correlations with other variables
run_preselect normalized correlation_threshold
# Step 5.2 - Remove variables that have low variance
run_preselect augmented variance_threshold
# Step 5.3 - Remove features known to be noisy
# Builds manual.txt: a `variable` header line followed by the column names taken
# from the first line of the sample plate's CSV, minus names containing `Meta`
# and minus the noisy feature families listed in the pattern below.
SAMPLE_PLATE_ID='BR00101075'
# Nothing in this script creates variable_selection/; make sure it exists so the
# redirect below cannot fail on a missing directory
mkdir -p "../../parameters/${BATCH_ID}/variable_selection"
{
  echo "variable"
  head -n 1 "../../backend/${BATCH_ID}/${SAMPLE_PLATE_ID}/${SAMPLE_PLATE_ID}.csv" \
    | tr "," "\n" \
    | grep -v Meta \
    | grep -E -v 'Granularity_14|Granularity_15|Granularity_16|Manders|RWC'
} > "../../parameters/${BATCH_ID}/variable_selection/manual.txt"
# Step 5.4 - Apply the variable selection steps to the profiles
# Note - This creates the _normalized_variable_selected.csv files in `backend`
# Run select.R once per entry read from ${PLATES} ({1} is replaced by the entry),
# applying the three filters named in --filters.
# --max-procs caps the number of simultaneous jobs at MAXPROCS; without it
# parallel starts one job per CPU core and the "keep 1 free" intent is lost.
parallel \
  --no-run-if-empty \
  --eta \
  --max-procs "${MAXPROCS}" \
  --joblog "../../log/${BATCH_ID}/select.log" \
  --results "../../log/${BATCH_ID}/select" \
  --files \
  --keep-order \
  ./select.R \
  --batch_id "${BATCH_ID}" \
  --plate_id {1} \
  --filters variance_threshold,correlation_threshold,manual :::: "${PLATES}"