-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnwx_analyze.py
389 lines (362 loc) · 16.6 KB
/
nwx_analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
#!/usr/bin/env python
# coding: utf-8
# original author: Mattis Knulst
# email: [email protected]
# contributors: Elias Carlsson, Tobias Westholm
import argparse
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from libpysal import weights
from alive_progress import alive_bar
import multiprocessing as mp
import warnings
# Command-line interface of the script. The parser is built at import time
# and consumed by the ``__main__`` section at the bottom of the file.
parser = argparse.ArgumentParser(
    description='Build a distance-band cell network for every ROI of a '
                'QuPath csv export and write network statistics for one '
                'cell type pair.')
# Input table with one row per detected cell.
parser.add_argument('-i', '--file', type=str, help='path to the csv file',
                    required=True)
# Directory that receives the results table.
parser.add_argument('-o', '--results_dir',
                    type=str,
                    help='path to the results directory'
                         ', default: current directory',
                    default='.')
# Distance threshold passed on to the graph construction.
parser.add_argument('-c', '--critical_distance',
                    type=int,
                    help='critical distance in pixels, default=30, if pixel size is 0.4 µm, the critical distance is 30*0.4=12 µm',
                    default=30)
# The two cell types to analyse; the script splits this value on ':'.
parser.add_argument('-p', '--pair', type=str,
                    help='cell type pair', required=True)
# csv dialect options forwarded to pandas.read_csv.
parser.add_argument('-s', '--sep', help='csv separator, default tab',
                    default='\t')
parser.add_argument('-d', '--decimal', help='float decimal sign, default .',
                    default='.')
# Only needed for plotting; remove the default (and set required=True) to
# force the user to specify it.
parser.add_argument('-t', '--tiff_dir', type=str,
                    help='path to the tiff directory, not used in this version, default: current directory',
                    default='.')
# NOTE(review): this value is parsed but not referenced anywhere else in the
# visible part of the file - confirm whether it is still needed.
parser.add_argument('-n', '--n_workers', type=int, default=1,
                    help="number of processes to use")
def _count_cross_type_edges(graph):
    """Return the number of edges whose endpoints differ in ``cell_type``."""
    nodes = graph.nodes
    return sum(1 for first, second in graph.edges
               if nodes[first]['cell_type'] != nodes[second]['cell_type'])


def _count_within_type_edges(graph, cell_type):
    """Return the number of edges whose endpoints are both ``cell_type``."""
    nodes = graph.nodes
    return sum(1 for first, second in graph.edges
               if nodes[first]['cell_type'] == cell_type
               and nodes[second]['cell_type'] == cell_type)


def _edge_ratio(observed, randomised):
    """Return ``observed / randomised``, or NaN if ``randomised`` is zero."""
    return observed / randomised if randomised > 0 else np.nan


def _random_int_axis(lowest, highest, size):
    """Draw ``size`` random integers from ``[int(lowest), int(highest))``.

    When the interval is empty (all cells share the same coordinate on this
    axis) the upper bound is widened by one so that ``np.random.randint``
    does not raise; every draw then equals ``int(lowest)``.
    """
    low = int(lowest)
    high = max(int(highest), low + 1)
    return np.random.randint(low, high, size=size)


def cluster_cooccurrence(df, G, critical_distance, cell_types, iterations=5):
    """
    Calculate cluster co-occurrence ratios against randomised layouts.

    For each iteration the cells keep their class labels but receive random
    integer coordinates drawn inside the bounding box of the real X/Y values.
    A graph is built from the random layout with ``make_graph`` and the edge
    counts of the real graph ``G`` are divided by the edge counts of the
    random graph. The ratios are averaged over all iterations, ignoring NaN.

    :param df: dataframe with 'X', 'Y' and 'Class' columns
    :param G: graph of the real data; nodes carry a 'cell_type' attribute
    :param critical_distance: distance threshold forwarded to ``make_graph``
    :param cell_types: sequence whose first two items are the cell type pair
    :param iterations: number of random layouts to average over
    :return: dict with keys 'between_all_ccr' (edges joining different cell
        types), 'within_1_ccr' and 'within_2_ccr' (edges joining two cells of
        the first / second cell type). A value is NaN when the respective
        cell types are absent or no random layout produced such an edge.
    """
    # bounding box of the real coordinates
    x_min, x_max = df['X'].min(), df['X'].max()
    y_min, y_max = df['Y'].min(), df['Y'].max()
    n_rows = df.shape[0]
    # Plain array: assigning it below is positional, so a non-default index
    # on df cannot turn the labels into NaN through index alignment.
    class_labels = df['Class'].to_numpy()
    unique_cells = df['Class'].unique()
    cell_type_1, cell_type_2 = cell_types[0], cell_types[1]

    several_types = len(unique_cells) > 1
    has_type_1 = cell_type_1 in unique_cells
    has_type_2 = cell_type_2 in unique_cells

    # The real graph does not change between iterations: count its edges once.
    observed_between = _count_cross_type_edges(G) if several_types else None
    observed_within_1 = (_count_within_type_edges(G, cell_type_1)
                         if has_type_1 else None)
    observed_within_2 = (_count_within_type_edges(G, cell_type_2)
                         if has_type_2 else None)

    results = {
        'between_all_ccr': [],
        'within_1_ccr': [],
        'within_2_ccr': [],
    }
    for _ in range(iterations):
        # random layout with the original class labels
        random_coordinates = pd.DataFrame({
            "X": _random_int_axis(x_min, x_max, n_rows),
            "Y": _random_int_axis(y_min, y_max, n_rows),
            "Class": class_labels,
        })
        random_graph, _random_weights = make_graph(random_coordinates,
                                                   critical_distance)

        if several_types:
            between = _edge_ratio(observed_between,
                                  _count_cross_type_edges(random_graph))
        else:
            between = np.nan
        results['between_all_ccr'].append(between)

        if has_type_1:
            within_1 = _edge_ratio(
                observed_within_1,
                _count_within_type_edges(random_graph, cell_type_1))
        else:
            within_1 = np.nan
        results['within_1_ccr'].append(within_1)

        if has_type_2:
            within_2 = _edge_ratio(
                observed_within_2,
                _count_within_type_edges(random_graph, cell_type_2))
        else:
            within_2 = np.nan
        results['within_2_ccr'].append(within_2)

    # np.nanmean warns (and returns NaN) when every entry is NaN, which is
    # the expected outcome when no connections were found.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return {key: np.nanmean(ratios) for key, ratios in results.items()}
def make_graph(df, critical_distance):
    """
    Build a networkx graph from cell coordinates using distance-band weights.

    :param df: dataframe with x, y coordinates and cell type (qupath output);
        the columns from 'X' through 'Y' are used as coordinates and 'Class'
        as the cell type
    :param critical_distance: distance in pixels, used as the band threshold
    :return: tuple (graph, weights); every node of the graph has a 'pos'
        attribute with its (x, y) coordinates and a 'cell_type' attribute
    """
    node_classes = df.Class
    xy = df.loc[:, 'X':'Y'].to_numpy()
    # libpysal may emit user warnings here; they are deliberately silenced
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        w = weights.DistanceBand.from_array(xy, threshold=critical_distance)
    # first element of w.full() is the dense matrix the graph is built from
    dense_matrix = w.full()[0]
    G = nx.from_numpy_array(dense_matrix)
    # node i of the graph corresponds to row i of the dataframe
    for node, (x, y) in enumerate(xy):
        G.nodes[node]['pos'] = (x, y)
    for node, label in enumerate(node_classes):
        G.nodes[node]['cell_type'] = label
    return G, w
def plot_graph(G, pair, results_dir, cell_type_filter, image_file,
               critical_distance, prepend=''):
    """
    Draw the graph on top of its image and save the figure as a png file.

    :param G: graph whose nodes carry 'pos' and 'cell_type' attributes
    :param pair: cell type pair, used in the output file name
    :param results_dir: path to the results directory
    :param cell_type_filter: the two cell type names; nodes of the first type
        are drawn in lime, all other nodes in magenta
    :param image_file: path to the image file
    :param critical_distance: critical distance, used in title and file name
    :param prepend: string to prepend to the filename
    :return: None
    """
    # some issues still remain with this function:
    # - it uses too much RAM for TMAs - resizing, tiling, individual core
    #   cropping or use of single channels may fix it
    # - file is not found when file name and image name do not match. Find a
    #   way to adapt this
    # - the file path may potentially be a problem still after the above
    #   matching. Remains to be seen.

    # The image name used in the file name and title is the last component of
    # image_file (either separator accepted). It used to be read from a
    # variable named `image` that is not defined in this function.
    image_name = image_file.replace('\\', '/').rsplit('/', 1)[-1]
    # get image
    my_img = plt.imread(image_file)
    # we don't want colons in file names
    nice = pair.replace(':', '_')
    out_file = (f'{nice}_{prepend}image_{image_name}'
                f'_distance_{critical_distance}.png')
    cell_types = [cell_type for node, cell_type in G.nodes(data='cell_type')]
    # plot the graph
    fig = plt.figure(figsize=(10, 10))
    try:
        y_lim, x_lim = my_img.shape[0], my_img.shape[1]
        extent = 0, x_lim, 0, y_lim
        # flip the image vertically
        my_img = np.flipud(my_img)
        # draw the image
        plt.imshow(my_img, cmap='gray', extent=extent,
                   interpolation='nearest')
        # cell type name and colors
        c1 = f"{cell_type_filter[0]} (lime)"
        c2 = f"{cell_type_filter[1]} (magenta)"
        color_map = ["lime" if cell_type == cell_type_filter[0] else "magenta"
                     for cell_type in cell_types]
        # draw graph on image
        nx.draw_networkx(G, pos=nx.get_node_attributes(G, 'pos'),
                         node_color=color_map, node_size=10, alpha=0.5,
                         edge_color='yellow', width=0.5, with_labels=False)
        plt.title(f'{c1} {c2} image {image_name} '
                  f'distance {critical_distance}px')
        plt.savefig(f'{results_dir}/{out_file}')
    finally:
        # release the figure even when drawing or saving fails
        plt.close(fig)
def calculate_statistics(df, G, w, cell_type_filter, critical_distance):
    """
    Calculate the network statistics of one graph.

    :param df: dataframe with x, y coordinates and cell type (qupath output)
    :param G: graph
    :param w: weights object
    :param cell_type_filter: the two cell type names (class 1, class 2)
    :param critical_distance: critical distance needed for ccr
    :return: list with, in this order: attribute assortativity coefficient
        (NaN if only one class is present), number of class 1 cells, number
        of class 2 cells, number of islands, group degree centrality of the
        class 1 nodes ('NA' if it could not be computed), proportion of
        class 1 cells among class 1 + class 2 cells (NaN if there are none),
        ccr between cell types, ccr within class 1, ccr within class 2
    :raises AssertionError: if df has no 'Class' column
    """
    # Raised explicitly so the check also runs under ``python -O``; the
    # exception type is the one the former assert statement produced.
    if 'Class' not in df.columns:
        raise AssertionError("df does not contain Class column")
    classes = df['Class']
    if len(classes.unique()) > 1:
        aac = nx.attribute_assortativity_coefficient(G, 'cell_type')
    else:
        aac = np.nan
    # count nr of cells in each class
    n_cells_class_1 = (classes == cell_type_filter[0]).sum()
    n_cells_class_2 = (classes == cell_type_filter[1]).sum()
    # count number of islands
    n_islands = len(w.islands)
    # get all nodes belonging to class 1
    class_1_nodes = [node for node, cell_type in G.nodes(data='cell_type')
                     if cell_type == cell_type_filter[0]]
    # calculate group degree centrality for class 1
    try:
        centrality_measures = nx.group_degree_centrality(G, class_1_nodes)
    except Exception:
        # Best effort: any failure is reported as 'NA' in the output.
        # NOTE(review): presumably this is hit when every node belongs to
        # class 1 - confirm against the networkx implementation.
        centrality_measures = 'NA'
    # ratio is the proportion of class 1 cells in the network
    n_cells_total = n_cells_class_1 + n_cells_class_2
    ratio = n_cells_class_1 / n_cells_total if n_cells_total else np.nan
    ccr = cluster_cooccurrence(df, G, critical_distance, cell_type_filter)
    return [aac, n_cells_class_1, n_cells_class_2,
            n_islands, centrality_measures, ratio,
            ccr['between_all_ccr'], ccr['within_1_ccr'], ccr['within_2_ccr']]
def network_plot(df, image, tiff_dir, critical_distance, results_dir,
                 prepend='',
                 pair='Elastas:CD163'):
    """
    Inner main function, called once per ROI of an image.

    Builds the graph for the cells in ``df`` and collects its statistics.

    :param df: dataframe with the cells of one ROI ('X', 'Y', 'Class' and
        'ROI' columns are read)
    :param image: image name, stored in the result row
    :param tiff_dir: path to the tiff directory (only needed for plotting)
    :param critical_distance: distance threshold for the graph
    :param results_dir: path to the results directory (only needed for
        plotting)
    :param prepend: string to prepend to the plot file name (only needed for
        plotting)
    :param pair: cell type pair, two names separated by ':'
    :return: list with the image name, the two cell type names, the values
        returned by ``calculate_statistics`` and the ROI value of the first
        row of ``df``
    """
    import os  # local import: only needed to build the image path below

    cell_type_filter = pair.split(':')
    # Image file path, only consumed by the disabled plot_graph call below.
    # os.path.join replaces the former hard-coded "\\" separator.
    image_file = os.path.join(tiff_dir, image)
    # create results list
    results = [image, cell_type_filter[0], cell_type_filter[1]]
    # make graph
    G, w = make_graph(df, critical_distance)
    # calculate statistics
    results.extend(calculate_statistics(df, G, w,
                                        cell_type_filter, critical_distance))
    # Take the ROI from the first row: the former iloc[1] raised IndexError
    # for a dataframe with a single row.
    results.append(df.iloc[0]['ROI'])
    # Plotting the graph (THIS FUNCTION IS COMMENTED AWAY BECAUSE THE GRAPH
    # BUGS WERE NOT SOLVED. IF A GRAPH IS TO BE CREATED OUTSIDE OF THIS
    # SCRIPT, CREATE ONE IN QUPATH WITH THE SAME CUTOFF DISTANCE.)
    # plot_graph(G, pair, results_dir, cell_type_filter, image_file,
    #            critical_distance, prepend=prepend)
    return results
if __name__ == "__main__":
    args = parser.parse_args()
    # the pair is given as TYPE1:TYPE2; fail with a usage message instead of
    # an IndexError further down when the separator is missing
    pair_names = args.pair.split(':')
    if len(pair_names) < 2:
        parser.error(f"--pair must look like TYPE1:TYPE2, got {args.pair!r}")
    cell_type_1, cell_type_2 = pair_names[0], pair_names[1]
    # create the output dictionary; the key order must match the order of the
    # values returned by network_plot
    out = {'image': [],
           'cell_type_1': [],
           'cell_type_2': [],
           'aac': [],
           'n_cells_class_1': [],
           'n_cells_class_2': [],
           'n_islands': [],
           'centrality_measures': [],
           'ratio': [],
           'ccr_between_1_2': [],
           'ccr_within_1': [],
           'ccr_within_2': [],
           'ROI': []}
    # read the csv file: the header row is skipped and the first six columns
    # get fixed names; columns containing missing values are dropped
    df = pd.read_csv(args.file,
                     sep=args.sep,
                     decimal=args.decimal,
                     low_memory=False,
                     skiprows=1,
                     usecols=[0, 1, 2, 3, 4, 5],
                     names=['Image',
                            'Class',
                            'Name',
                            'ROI', 'X', 'Y']).dropna(axis=1)
    # filter on cell type included in analysis AND clean out any annotations
    # & detections (outside of ROIs) with the parent named "Image"
    in_pair = (df['Class'] == cell_type_1) | (df['Class'] == cell_type_2)
    filtered = df[in_pair & (df['ROI'] != 'Image')].reset_index(drop=True)
    # list all images in the filtered dataset
    images = sorted(filtered['Image'].unique())
    # group once and reuse the grouping for every per-image lookup
    per_image = filtered.groupby('Image')
    # check that all images have both classes
    for image in images:
        classes_in_image = per_image.get_group(image)['Class'].unique()
        for cell_type in (cell_type_1, cell_type_2):
            if cell_type not in classes_in_image:
                print(f"{cell_type} not in {image}")
    # scale the loading bar: one step per (image, ROI) combination, so ROI
    # labels that recur in different images are counted correctly
    progress = len(filtered[['Image', 'ROI']].drop_duplicates())
    # ROIs that could not be analysed, reported after the loop
    failures = []
    # main iterative loop. Calculates metrics for each ROI in each image.
    with alive_bar(progress) as bar:
        # iterate over all images
        for image in images:
            # extract the image
            one_pic = per_image.get_group(image).reset_index(drop=True)
            # iterate through all ROIs in the picture
            for roi_value in sorted(one_pic['ROI'].unique()):
                # pick out all rows with the right ROI number. roi_value
                # MIGHT NEED TO BE CONVERTED TO STRING OR INTEGER IN THE
                # COMPARISON BELOW
                one_roi = one_pic.loc[
                    one_pic['ROI'] == roi_value].reset_index(drop=True)
                try:
                    # run the network plot
                    results = network_plot(one_roi, image, args.tiff_dir,
                                           args.critical_distance,
                                           args.results_dir,
                                           pair=args.pair)
                except Exception as e:
                    # best effort: skip this ROI but remember why
                    bar.text(f'Error in {image}: {e}')
                    failures.append((image, roi_value, e))
                else:
                    # append to output dictionary
                    for key, value in zip(out, results):
                        out[key].append(value)
                # advance once per ROI, whether it succeeded or not
                bar()
    # the bar text is transient, so list the skipped ROIs explicitly
    for image, roi_value, error in failures:
        print(f"Skipped ROI {roi_value} in {image}: {error}")
    # create dataframe from out dictionary
    out_df = pd.DataFrame(out)
    print(out_df.head())
    # save the output; we don't want colons in file names
    nice = args.pair.replace(':', '_')
    out_df.to_csv(
        f'{args.results_dir}/results_{nice}_distance_{args.critical_distance}px.csv',
        index=False,
        sep='\t')