-
Notifications
You must be signed in to change notification settings - Fork 1
/
contraction_bk.h
executable file
·192 lines (139 loc) · 6.81 KB
/
contraction_bk.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#ifndef _CONTRACTION_H_
#define _CONTRACTION_H_
#include <stdlib.h>
#include <mpi.h>
#include <list>
#include <string>
#include <algorithm>
#include <map>
#include "tensor.h"
#include "helper.h"
#include "redistribute.h"
#include "localcontract.h"
#include "omp.h"
// Integer tags used by the contraction code. Judging by the names they mark an
// index as "external index of A", "external index of B" or "contracted between
// A and B" -- TODO confirm against the users in the .cpp (e.g. the ext_input
// argument of fill_input_to_output_map and is_C_indx_ext_A_or_ext_B).
#define EXT_A 1
#define EXT_B 2
#define CONTR_AB 0
// Ideally extend this class and make different contractions from it
class Contraction
{
private:
int rank, num_procs;
int *my_address;
Tensor *A, *B, *C;
int dims_A, dims_B, dims_C;
//permutation map for A, B and C before performing local dgemm
//C also needs an inverse permutation map after the dgemm completess
int *p_map_A, *p_map_B, *p_map_C, *inv_p_map_C;
//stores the block range of C after it has been transposed
//this pock range is used for inverse transposing C back to
//its original layout
int* inverse_block_range_C;
//stores if a given index of C is external index of A or B
int* is_C_indx_ext_A_or_ext_B;
//parameters for dgemm
int n_a, n_b, n_k;
//dgemm flags for A, B and C
int t_A, t_B;
MPI_Request *send_req_addr, *send_req_data;
MPI_Request *recv_req;
// This mapping is used to find tile address of the output tensor
int* A_to_C_map;
int* B_to_C_map;
int num_senders;
int num_receivers;
int* current_senders;
int* current_receivers;
int grid_dims;
int *pgrid;
int serial;
int* self_adr_sizes;
double** self_buffers;
int** self_addresses;
int* self_data_sizes;
int num_self_sends;
int check_A;
int check_B;
int num_blks_A;
int num_blks_B;
// Performance metrics
int local_num_dgemm, total_num_dgemm;
double dgemm_time;
double redist_time, comm_time, tr_time, comp_time;
double instigation_time, forward_receive_time;
double block_size_forward_time, address_forward_time, data_forward_time;
double timer1, timer2, timer3, timer4, timer5, timer6;
double m_timer1, m_timer2, m_timer3, m_timer4, m_timer5, m_timer6;
double max_block_size_forward_time, max_address_forward_time, max_data_forward_time;
double max_redist_time, max_comm_time, max_tr_time, max_comp_time;
double max_instigation_time, max_forward_receive_time;
//returns a permutation map that combines the affect of
//applying map1 and map2 consecutively
void compose_p_maps(Tensor* &T, int* &map1, int* &map2, int* &composed_map);
//generates permutation maps based on symmetry for bounced blocks
void get_symm_permutation(Tensor* &T, int* &tile_address, int* &sym_permutation);
//generates permutation maps for A, B and C for doing transpose before local dgemm can be performed
void generate_permutation_map(Tensor* &A, Tensor* &B, Tensor* &C, std::list<pair<int,int>> &contr_list);
//gives the mapping of external indicies in the input to external
//indicies in the output. This mapping is used to find tile address
//of the output tensor. Also assigns the indicies of input and output
//as external index or internal index. Asigns the indicies of C
//as external index of A or external index of B
void fill_input_to_output_map(string* &input, int input_dims, string* &output, int output_dims, int* &input_to_output_map, int ext_input);
//stores the address of C corresponding to address of A and B in C_address
void get_C_address(int* &A_address,int dims_a, int* &B_address, int dims_b, int* &C_address, int dims_c);
// Extracts blocks from the serialized input tensor that are required
// for computing blocks in C along a dimension that is serialized in the
// input tensor but distributed in C
int get_blks_from_serialized(int num_dim, string* &ext_dim, int* &ext_grid_dim, Tensor* &T, double* &blocks, int* &addresses);
// Finds out blocks required right now for instigation from a serialized tensor
int get_deserialized_blocks(Tensor* &T, double* &blocks, int* &addresses, int* &des_dims, int* &des_grid_dims, int &num_dims);
// Deserializes a tensor
int deserialize(Tensor* &T, double* &blocks, int* &addresses, int num_dim, int* &ext_T, int* &ext_grid_dim, std::list<int>* &sym_list);
// Deserialize the 2D input tensor T if the other input tensor S is 4D and the output is 2D
Tensor* deserialize_422(Tensor* &T, Tensor* &S);
// Asserts validity of specified contraction
void assert_contr_validity(Tensor* &A, Tensor* &B);
// Send data to instigator
void send_to_instigator( Tensor* &X, int contr_dim, int contr_idx, int &count_addr_sends, int &count_data_sends);
// Receive data at instigator
int recv_at_instigator(Tensor* &X, int contr_dim, int contr_idx, double* &blocks, int* &block_addrs, int send_addr_count, int send_data_count);
// Test for completion of bounce send receives of this processor
void test_comm_completion();
//waits until message sizes of all the senders have been recieved
void initial_wait(int num_senders, MPI_Request* &recv_req, int* senders);
// Merges multiple block sets into one removing duplicates
// Returns number of block copied
int merge_self_send_blocks(Tensor* &T, double* &out_blocks, int* &out_addr);
void forward_int(Tensor* A, int* &data, int size, int k, int cdim, bool is_instigator);
void forward_double(Tensor* A, double* &data, int size, int k, int cdim, bool is_instigator);
// Returns the rank of the previous processor in given dimension
int prev_processor(int dim);
// Returns the rank of the next processor in given dimension
int next_processor(int dim);
// Parses given contraction string and return the number of dimensions
int parse_contr_str(std::string contr_str, std::string* &out);
// Check if redistribution is required for an input tensor
bool check_redistr(Tensor* &T, Tensor* &C, int* &new_idmap, std::string* &t, std::string* &c);
// Performs recursive SUMMA
void rec_summa(Tensor* &A, Tensor* &B, Tensor* &C, std::list<std::pair<int,int>> contr_list,
std::pair<int,int> prev_cdim1, std::pair<int,int> prev_cdim2);
void temp_rec_summa(Tensor* &A, Tensor* &B, Tensor* &C, list<pair<int,int>> contr_list,
pair<int,int> prev_cdim1, pair<int,int> prev_cdim2);
// Perform transpose as required on input tensor blocks and call dgemm
void transpose_and_dgemm(int num_blocks_A, int num_blocks_B, double* &blocks_A, double* &blocks_B, int* &block_addr_A, int* &block_addr_B, int k);
// Perform 2D matrix multiplication
void kevin_dgemm(int n_a, int n_b, int n_k, double* &A, double* &B, double* &C, int at, int bt, double alpha);
// Print information about the performance of this contraction
void print_time_flops();
public:
// Constructor
Contraction(Tensor* &a, Tensor* &b, Tensor* &c, int grid_dimensions, int* &phy_grid);
// Destructor
~Contraction();
// Processors send to instigator and instigator collects the data
int instigate_collection(Tensor* &X, int contr_dim, int contr_idx, double* &blocks, int* &clocks_addrs);
// Contraction procedure
void contract(std::string contr_str_A, std::string contr_str_B, std::string contr_str_C);
};
#endif