# -*- coding: utf-8 -*-
"""
Created on Fri Jun 5 01:05:33 2020
@author: mwooten3
See 3DSI/GLAS_gdb_notes.txt
Purpose: To build a GDB of buffered GLAS shots from csv(s)
Process:
Iterate through big boreal GLAS .csv
For each row, convert lat/lon to OGR point object
Buffer the point object by X m and save to output polygon .gdb
Write attributes from row to .gdb
Intermediate data: /att/gpfsfs/briskfs01/ppl/mwooten3/3DSI/GLAS/
Final .gdb will be moved to: /att/gpfsfs/briskfs01/ppl/mwooten3/3DSI/GLAS_naBoreal.gdb
NOTES:
Working with GPKG, because I think the FileGDB driver is finicky.
If input is .gdb, will write to .gpkg then convert to .gdb
Added overwrite step (7/30) so that we can pick up if we left off. Still testing though
"""
import os
import csv
import argparse
import datetime
import platform
#import pandas as pd
from collections import OrderedDict
from osgeo import ogr, osr # gdal also available from osgeo if needed
from FeatureClass import FeatureClass
from SpatialHelper import SpatialHelper
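# NOTE (assumption): FeatureClass and SpatialHelper appear to be local project
# modules from the 3DSI codebase; they must live alongside this script or on
# PYTHONPATH for the imports above to resolve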
# Breaking up the creation into batches between the 10 crane nodes:
# node --> (x, y) where the y is not inclusive (skip when count = y)
countDict = {'crane101': '0,5000000',
'crane102': '5000000,10000000',
'crane103': '10000000,15000000',
'crane104': '15000000,20000000',
'crane105': '20000000,25000000',
'crane106': '25000000,30000000',
'crane107': '30000000,35000000',
'crane108': '35000000,40000000',
'crane109': '40000000,45000000',
'crane110': '45000000,50000000'
}
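# e.g., crane101 handles input rows 0 through 4,999,999; crane110 handles
# rows 45,000,000 up to (but not including) 50,000,000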
def addBufferToOutput(rowDict, layer, bufferDist):
# Add a lat/lon point with X m buffer + attributes to a layer
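    # Note: the point is reprojected to its UTM zone before buffering so that
    # bufferDist is interpreted in meters; buffering directly in the lat/lon
    # layer SRS (EPSG:4326) would treat the distance as degrees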
layerDefn = layer.GetLayerDefn()
layerSrs = layer.GetSpatialRef()
layerEpsg = layerSrs.GetAuthorityCode(None)
lat = float(rowDict['lat'])
lon = float(rowDict['lon'])
    # Create an OGR point from lat/lon
pointGeom = ogr.Geometry(ogr.wkbPoint)
pointGeom.AddPoint(lon, lat)
# Convert point to meter proj
utmEpsg = SpatialHelper().determineUtmEpsgFromShape(pointGeom, layerEpsg)
pointProj = SpatialHelper().reprojectShape(pointGeom, layerEpsg, utmEpsg)
# Get buffered polygon geometry from projected point
polyProj = pointProj.Buffer(int(bufferDist))
# Then convert polygon back to lat/lon proj
polyBuffer = SpatialHelper().reprojectShape(polyProj, utmEpsg, layerEpsg)
# Create buffered polygon feature
polyFeature = ogr.Feature(layerDefn)
polyFeature.SetGeometry(polyBuffer)
# Add attributes to OGR columns for feature
for colName in rowDict.keys():
polyFeature.SetField(colName, rowDict[colName])
# Add feature with attributes to layer
layer.CreateFeature(polyFeature)
    polyFeature = pointGeom = None # dereference OGR objects
return None
def addShotDate(rowDict):
# Get the shot day (julian) and year from the date column
# "date" column represents days since January 1, 2003
daysSinceStart = float(rowDict['date'])
startDate = datetime.datetime.strptime('2003-01-01', "%Y-%m-%d")
shotDate = startDate + datetime.timedelta(days=daysSinceStart)
rowDict['shotYear'] = shotDate.timetuple().tm_year
rowDict['shotMonth'] = shotDate.timetuple().tm_mon
rowDict['shotDay'] = shotDate.timetuple().tm_mday # day of the month
rowDict['shotJD'] = shotDate.timetuple().tm_yday # Julian day
return rowDict
def convertGpkgToGdb(gpkg, gdb):
# Call ogr2ogr and convert to .gdb
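    # NOTE: as of this script's vintage (2020), writing FileGDB via ogr2ogr
    # requires GDAL built against the ESRI FileGDB API SDK; the built-in
    # OpenFileGDB driver was read-only at the time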
cmd = "ogr2ogr -f 'FileGDB' {} {}".format(gdb, gpkg)
print " {}".format(cmd)
os.system(cmd)
return gdb
def createFields(layer, columns):
    # Columns in stringColumns get ogr.OFTString; columns in intColumns get
    # ogr.OFTInteger; all others default to float64 (ogr.OFTReal)
stringColumns = ['datatake']
intColumns = ['bufferSize', 'shotYear', 'shotMonth', 'shotDay', 'shotJD']
# Add unique ID separately so that it's first
layer.CreateField(ogr.FieldDefn('uniqueID', ogr.OFTInteger))
for col in columns:
# Determine column type (default is Float)
colType = ogr.OFTReal
if col in stringColumns:
colType = ogr.OFTString
if col in intColumns:
colType = ogr.OFTInteger
layer.CreateField(ogr.FieldDefn(col, colType))
return None
def rowDictToCsv(rowDict, writeCsv):
    # Write header if the .csv does not exist
if not os.path.isfile(writeCsv):
with open(writeCsv, 'w') as wc:
wc.write('{}\n'.format(','.join(rowDict.keys())))
# Write row
with open(writeCsv, 'a') as wc:
wc.write('{}\n'.format(','.join(rowDict.values())))
return None
def filterRow(rowDict):
    # Return True if the point should be removed; keep the point only if all
    # of the following criteria are met:
    #   'FRir_qaFlag' == 15
    #   'satNdx' < 2
    #   'cld1_mswf' < 15
removePoint = True
if int(rowDict['FRir_qaFlag']) == 15 and int(rowDict['satNdx']) < 2 and \
int(rowDict['cld1_mswf']) < 15:
removePoint = False
return removePoint
def main(args):
startTime = datetime.datetime.now()
    # Arguments: input GLAS .csv; output .gdb; buffer distance (m)
inCsv = args['inputCsv']
outGdb = args['outputGdb']
bufferDist = args['bufferDistance']
overwrite = True # Set to False if we want to append to gdb/gpkg
node = platform.node()
"""
inCsv = '/att/gpfsfs/briskfs01/ppl/pmontesa/userfs02/data/glas/circ_boreal/gla01-boreal50up-fix2-data.csv'
outGdb = '/att/gpfsfs/briskfs01/ppl/mwooten3/3DSI/GLAS/GLAS_boreal.gdb'
bufferDist = 15
"""
# Get the header. Chaotic, but the column names for the fix2-data .csv are here
hdrFile = '/att/nobackup/mwooten3/3DSI/GLAS/GLAS_csv_columns.txt'
with open(hdrFile, 'r') as hf:
hdr = hf.readline().strip('\n').split(',')
# To save rows with bad lat/lon values to csv:
#badCoordCsv = '/att/gpfsfs/briskfs01/ppl/mwooten3/3DSI/GLAS/badCoords.csv'
# To save info about batches (already made header in file)
batchCsv = '/att/gpfsfs/briskfs01/ppl/mwooten3/3DSI/GLAS/batchTrack.csv'
    # Hardcode the additional columns we want to add to the .gdb
    # uniqueID is handled separately because we want it to be first
extraColumns = ['shotYear', 'shotMonth', 'shotDay', 'shotJD', 'bufferSize']
# Get basename (aka eventual layer name) of output .gdb
ext = os.path.splitext(outGdb)[1]
    bname = os.path.splitext(os.path.basename(outGdb))[0] # splitext, not strip(ext): strip() removes characters, not a suffix
# Get the output .gpkg because that's what we are writing to
if ext == '.gdb':
outGpkg = outGdb.replace('.gdb', '__{}.gpkg'.format(node))
convertToGdb = True
elif ext == '.gpkg':
outGpkg = outGdb.replace('.gpkg', '__{}.gpkg'.format(node))
convertToGdb = False
convertToGdb = False #TEMP - set to False until last batch
driver = ogr.GetDriverByName('GPKG')
# Create the output if overwrite is True or file does not exist
if overwrite or not os.path.isfile(outGpkg):
print "\nCreating {}...".format(outGpkg)
# Set up the output gdb
ds = driver.CreateDataSource(outGpkg)
        srs = osr.SpatialReference()
srs.ImportFromEPSG(4326) # WGS84 Lat/Lon
# Create file with srs 4326; layerName = basename
layer = ds.CreateLayer(bname, srs, ogr.wkbPolygon)
# Create the fields in the output .gdb using hdr contents and addl columns
createFields(layer, hdr + extraColumns)
# Open the layer if not overwriting
else:
print "\nOpening and appending to {}...".format(outGpkg)
        ds = driver.Open(outGpkg, 1) # 1 = update mode, so CreateFeature can append
layer = ds.GetLayer()
# 8/14 Now that we are hardcoding batches for crane nodes, get that info
    cntStart, cntEnd = [int(i) for i in countDict[node].split(',')]
# Iterate through rows in .csv and write buffered polygons to output
totCnt = 0 # count all of the points, not just the unfiltered ones
badCnt = 0 # count of points with missing columns
fltrCnt = 0 # count of points that were filtered
cnt = 0 # count of points added to gdb
    badLon = [] # list of out-of-range coordinates (lon beyond +/-180 or lat beyond +/-90)
#import pdb; pdb.set_trace()
with open(inCsv, 'r') as c:
data = csv.reader(c)
for row in data:
totCnt += 1
# split into batch depending on crane node
if totCnt < cntStart or totCnt >= cntEnd:
continue
            # Print progress every 50,000 rows (total count, not just unfiltered)
            if totCnt % 50000 == 0:
print " Total count so far = {}".format(totCnt)
print " (Added count = {})".format(cnt)
#layer.SyncToDisk() # Hopefully save info from memory to gpkg - dont think this worked
            # Sometimes a row has a different number of columns than the header. Skip those
if len(row) != len(hdr):
badCnt += 1
continue
            # Build an ordered dict mapping header column names to row values
            # (do not reuse 'c' here; it is the open file handle above)
            rowDict = OrderedDict() # keep column order
            for colName, value in zip(hdr, row):
                rowDict[colName] = value
            # Some rows have an out-of-range longitude. Record info in list
if float(rowDict['lon']) > 180 or float(rowDict['lon']) < -180:
badLon.append('{}: {}'.format(rowDict['lon'], totCnt))
rowDict['totCnt'] = str(totCnt)
#rowDictToCsv(rowDict, badCoordCsv) # save to .csv
continue # and skip
# Check for lat too just in case
if float(rowDict['lat']) > 90 or float(rowDict['lat']) < -90:
badLon.append('{}: {}'.format(rowDict['lat'], totCnt))
rowDict['totCnt'] = str(totCnt)
#rowDictToCsv(rowDict, badCoordCsv)
continue # and skip
# Filter the row. Will return False if we want to keep the point
if filterRow(rowDict):
fltrCnt += 1
continue # if True, skip point
rowDict['uniqueID'] = cnt + 1 # Add unique ID to dict
rowDict['bufferSize'] = bufferDist
rowDict = addShotDate(rowDict)
# Now send the row to function to buffer and add to output
addBufferToOutput(rowDict, layer, bufferDist)
cnt += 1 # only add to count if point is added
nFeat1 = FeatureClass(outGpkg).nFeatures
    # Destroy the datasource to flush features to disk and free memory
    ds.Destroy()
endTime1 = datetime.datetime.now()
elapsedTime1 = endTime1 - startTime
    # Record info:
    # node,start,end,totCnt,addCnt,nFeat,missingCols,badCoords,filtered,time (minutes)
    elapsedMin1 = round(elapsedTime1.total_seconds() / 60.0, 1)
    rec = '{},{},{},{},{},{},{},{},{},{}\n'.format(node, cntStart, cntEnd,
                          totCnt, cnt, nFeat1, badCnt, len(badLon), fltrCnt,
                          elapsedMin1)
    with open(batchCsv, 'a') as bc:
        bc.write(rec)
    print "\nFinished creating .gpkg ({} features) in ~{} minutes".format(
                                                        nFeat1, elapsedMin1)
print "\nRow counts:"
print " Total rows = {}".format(totCnt)
print " Filtered rows = {}".format(fltrCnt)
print " Rows with missing columns = {}".format(badCnt)
print " Rows with longitude value too large = {}".format(len(badLon))
print " Added rows = {}".format(cnt)
if convertToGdb:
print "\nConverting output .gpkg to .gdb ({})".format(outGdb)
convertGpkgToGdb(outGpkg, outGdb)
nFeat2 = FeatureClass(outGdb).nFeatures
endTime2 = datetime.datetime.now()
elapsedTime2 = endTime2 - endTime1
print "\nFinished converting to .gdb ({} features) in ~{} minutes\n". \
format(nFeat2, *divmod(elapsedTime2.total_seconds(), 60))
return outGdb
return outGpkg
if __name__ == "__main__":
parser = argparse.ArgumentParser()
    # Input may eventually be a raster or a table with lat/lon instead of a .csv
parser.add_argument("-i", "--inputCsv", type = str, required = True,
help = "Input GLAS .csv")
parser.add_argument("-o", "--outputGdb", type = str, required = True,
help = "Output GDB")
parser.add_argument("-b", "--bufferDistance", type = str, required = False,
default = 15, help = "Buffer radius in m; default = 15")
args = vars(parser.parse_args())
main(args)