#!/usr/bin/env python
u"""
nsidc_icesat2_sync.py
Written by Tyler Sutterley (08/2020)

Program to acquire ICESat-2 datafiles from the NSIDC server:
https://wiki.earthdata.nasa.gov/display/EL/How+To+Access+Data+With+Python
https://nsidc.org/support/faq/what-options-are-available-bulk-downloading-data-https-earthdata-login-enabled
http://www.voidspace.org.uk/python/articles/authentication.shtml#base64

Register with NASA Earthdata Login system:
https://urs.earthdata.nasa.gov

Add NSIDC_DATAPOOL_OPS to NASA Earthdata Applications
https://urs.earthdata.nasa.gov/oauth/authorize?client_id=_JLuwMHxb2xX6NwYTb4dRA

CALLING SEQUENCE:
    python nsidc_icesat2_sync.py --user=<username> --release=001 ATL06
    where <username> is your NASA Earthdata username

INPUTS:
    ATL03: Global Geolocated Photon Data
    ATL04: Normalized Relative Backscatter
    ATL06: Land Ice Height
    ATL07: Sea Ice Height
    ATL08: Land and Vegetation Height
    ATL09: Atmospheric Layer Characteristics
    ATL10: Sea Ice Freeboard
    ATL12: Ocean Surface Height
    ATL13: Inland Water Surface Height

COMMAND LINE OPTIONS:
    --help: list the command line options
    -U X, --user=X: username for NASA Earthdata Login
    -N X, --netrc=X: path to .netrc file for alternative authentication
    -D X, --directory=X: working data directory
    -Y X, --year=X: years to sync separated by commas
    -S X, --subdirectory=X: subdirectories to sync separated by commas
    --release=X: ICESat-2 data release to sync
    --version=X: ICESat-2 data version to sync
    --track=X: ICESat-2 reference ground tracks to sync
    --granule=X: ICESat-2 granule regions to sync
    --auxiliary: Sync ICESat-2 auxiliary files for each HDF5 file
    -I X, --index=X: Input index of ICESat-2 files to sync
    -F, --flatten: Do not create subdirectories
    -P X, --np=X: Number of processes to use in file downloads
    -M X, --mode=X: Local permissions mode of the directories and files synced
    -l, --log: output log of files downloaded
    -L, --list: print files to be transferred, but do not execute transfer
    -C, --clobber: Overwrite existing data in transfer

PYTHON DEPENDENCIES:
    numpy: Scientific Computing Tools For Python
        https://numpy.org
        https://numpy.org/doc/stable/user/numpy-for-matlab-users.html
    lxml: Pythonic XML and HTML processing library using libxml2/libxslt
        https://lxml.de/
        https://github.com/lxml/lxml

PROGRAM DEPENDENCIES:
    utilities: download and management utilities for syncing files

UPDATE HISTORY:
    Updated 08/2020: moved urllib opener to function.
        add credential check
    Updated 07/2020: added option index to use a list of files to sync
    Updated 06/2020: added multiprocessing option for parallel download
    Updated 05/2020: added option netrc to use alternative authentication
        adjust regular expression to allow syncing of ATL07 sea ice products
        adjust regular expression for auxiliary products
    Updated 03/2020: added option flatten to not create subdirectories
    Updated 09/2019: added ssl context to urlopen headers
    Updated 07/2019: added options to sync specific granules, tracks and version
    Updated 06/2019: use strptime to extract last modified time of remote files
    Written 01/2019
"""
from __future__ import print_function

import sys
import os
import re
import ssl
import netrc
import getopt
import shutil
import base64
import getpass
import builtins
import posixpath
import traceback
import lxml.etree
import numpy as np
import calendar, time
import multiprocessing as mp
if sys.version_info[0] == 2:
    from cookielib import CookieJar
    import urllib2
else:
    from http.cookiejar import CookieJar
    import urllib.request as urllib2

#-- PURPOSE: "login" to NASA Earthdata with supplied credentials
def build_opener(username, password, urs='https://urs.earthdata.nasa.gov'):
    """
    Build urllib opener for NASA Earthdata with supplied credentials

    Arguments
    ---------
    username: NASA Earthdata username
    password: NASA Earthdata password

    Keyword arguments
    -----------------
    urs: Earthdata login URS host
    """
    #-- https://docs.python.org/3/howto/urllib2.html#id5
    #-- create a password manager
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    #-- Add the username and password for NASA Earthdata Login system
    password_mgr.add_password(None,urs,username,password)
    #-- Encode username/password for request authorization headers
    base64_string = base64.b64encode('{0}:{1}'.format(username,password).encode())
    #-- Create cookie jar for storing cookies. This is used to store and return
    #-- the session cookie given to us by the data server (otherwise it will
    #-- just keep sending us back to Earthdata Login to authenticate).
    cookie_jar = CookieJar()
    #-- create "opener" (OpenerDirector instance)
    opener = urllib2.build_opener(
        urllib2.HTTPBasicAuthHandler(password_mgr),
        urllib2.HTTPSHandler(context=ssl.SSLContext()),
        urllib2.HTTPCookieProcessor(cookie_jar))
    #-- add Authorization header to opener
    authorization_header = "Basic {0}".format(base64_string.decode())
    opener.addheaders = [("Authorization", authorization_header)]
    #-- Now all calls to urllib2.urlopen use our opener.
    urllib2.install_opener(opener)
    #-- All calls to urllib2.urlopen will now use this handler
    #-- Make sure not to include the protocol in with the URL, or
    #-- HTTPPasswordMgrWithDefaultRealm will be confused.
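
#-- NOTE: a minimal usage sketch of the authentication functions above (the
#-- credentials shown are placeholders).  build_opener() installs the opener
#-- globally, so any subsequent urllib2.urlopen() call is authenticated
#-- against NASA Earthdata:
#--     build_opener('<username>', '<password>')
#--     check_credentials()
#--     response = urllib2.urlopen('https://n5eil01u.ecs.nsidc.org/ATLAS/')
#-- credentials can alternatively be stored in a ~/.netrc file (see the
#-- --netrc option) with an entry of the form:
#--     machine urs.earthdata.nasa.gov login <username> password <password>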

#-- PURPOSE: check that entered NASA Earthdata credentials are valid
def check_credentials():
    """
    Check that entered NASA Earthdata credentials are valid
    """
    try:
        remote_path = posixpath.join('https://n5eil01u.ecs.nsidc.org','ATLAS')
        request = urllib2.Request(url=remote_path)
        response = urllib2.urlopen(request, timeout=20)
    except urllib2.HTTPError:
        raise RuntimeError('Check your NASA Earthdata credentials')
    except urllib2.URLError:
        raise RuntimeError('Check internet connection')
    else:
        return True

#-- PURPOSE: sync the ICESat-2 elevation data from NSIDC
def nsidc_icesat2_sync(ddir, PRODUCTS, RELEASE, VERSIONS, GRANULES, TRACKS,
    YEARS=None, SUBDIRECTORY=None, AUXILIARY=False, INDEX=None, FLATTEN=False,
    LOG=False, LIST=False, PROCESSES=0, MODE=None, CLOBBER=False):
    #-- check if directory exists and recursively create if not
    os.makedirs(ddir,MODE) if not os.path.exists(ddir) else None

    #-- output of synchronized files
    if LOG:
        #-- format: NSIDC_IceSat-2_sync_2002-04-01.log
        today = time.strftime('%Y-%m-%d',time.localtime())
        LOGFILE = 'NSIDC_IceSat-2_sync_{0}.log'.format(today)
        fid = open(os.path.join(ddir,LOGFILE),'w')
        print('ICESat-2 Data Sync Log ({0})'.format(today), file=fid)
    else:
        #-- standard output (terminal output)
        fid = sys.stdout

    #-- compile HTML parser for lxml
    parser = lxml.etree.HTMLParser()

    #-- remote https server for ICESat-2 Data
    HOST = 'https://n5eil01u.ecs.nsidc.org'
    #-- regular expression operator for finding files of a particular granule
    #-- find ICESat-2 HDF5 files in the subdirectory for product and release
    regex_track = '|'.join(['{0:04d}'.format(T) for T in TRACKS])
    regex_granule = '|'.join(['{0:02d}'.format(G) for G in GRANULES])
    regex_version = '|'.join(['{0:02d}'.format(V) for V in VERSIONS])
    regex_suffix = '(.*?)' if AUXILIARY else '(h5)'
    remote_regex_pattern = (r'{0}(-\d{{2}})?_(\d{{4}})(\d{{2}})(\d{{2}})(\d{{2}})'
        r'(\d{{2}})(\d{{2}})_({1})(\d{{2}})({2})_({3})_({4})(.*?).{5}$')
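    #-- NOTE: an illustrative granule name matched by the pattern above
    #-- (product_YYYYMMDDHHMMSS_ttttccgg_release_version.h5, where tttt is the
    #-- reference ground track, cc the cycle and gg the granule region; the
    #-- optional (-\d{2}) hemisphere code applies to the sea ice products):
    #--     ATL06_20190214100000_07280203_003_01.h5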

    #-- regular expression operator for finding subdirectories
    if SUBDIRECTORY:
        #-- Sync particular subdirectories for product
        R2 = re.compile(r'('+'|'.join(SUBDIRECTORY)+')', re.VERBOSE)
    elif YEARS is not None:
        #-- Sync particular years for product
        regex_pattern = '|'.join('{0:d}'.format(y) for y in YEARS)
        R2 = re.compile(r'({0}).(\d+).(\d+)'.format(regex_pattern), re.VERBOSE)
    else:
        #-- Sync all available subdirectories for product
        R2 = re.compile(r'(\d+).(\d+).(\d+)', re.VERBOSE)

    #-- build list of remote files, remote modification times and local files
    remote_files = []
    remote_mtimes = []
    local_files = []
    #-- build lists of files or use existing index file
    if INDEX:
        #-- read the index file, split at lines and remove all commented lines
        with open(os.path.expanduser(INDEX),'r') as f:
            files = [i for i in f.read().splitlines() if re.match(r'^(?!#)',i)]
        #-- regular expression operator for extracting information from files
        rx = re.compile(r'(ATL\d{2})(-\d{2})?_(\d{4})(\d{2})(\d{2})(\d{2})'
            r'(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$')
        #-- for each line in the index
        for f in files:
            #-- extract parameters from ICESat-2 ATLAS HDF5 file
            PRD,HEM,YY,MM,DD,HH,MN,SS,TRK,CYC,GRN,RL,VRS,AUX = rx.findall(f).pop()
            #-- get directories from remote directory (* splat operator)
            sd = ['{0}.{1}'.format(PRD,RL),'{0}.{1}.{2}'.format(YY,MM,DD)]
            remote_dir = posixpath.join(HOST,'ATLAS',*sd)
            #-- local directory for product and subdirectory
            if FLATTEN:
                local_dir = os.path.expanduser(ddir)
            else:
                local_dir = os.path.join(ddir,*sd)
            #-- check if data directory exists and recursively create if not
            os.makedirs(local_dir,MODE) if not os.path.exists(local_dir) else None
            #-- find ICESat-2 data file to get last modified time
            req = urllib2.Request(url=remote_dir)
            #-- read and parse request for remote files (columns and dates)
            tree = lxml.etree.parse(urllib2.urlopen(req), parser)
            colnames = tree.xpath('//td[@class="indexcolname"]//a/@href')
            collastmod = tree.xpath('//td[@class="indexcollastmod"]/text()')
            #-- find matching files (for granule, release, version, track)
            remote_file_line = [i for i,n in enumerate(colnames) if (n == f.strip())]
            #-- print if file was not found
            if not remote_file_line:
                print('{0} not found on {1}'.format(f,remote_dir), file=fid)
            #-- add to lists
            for i in remote_file_line:
                #-- remote and local versions of the file
                remote_files.append(posixpath.join(remote_dir,colnames[i]))
                local_files.append(os.path.join(local_dir,colnames[i]))
                #-- get last modified date and convert into unix time
                LMD = time.strptime(collastmod[i].rstrip(),'%Y-%m-%d %H:%M')
                remote_mtimes.append(calendar.timegm(LMD))
    else:
        #-- for each ICESat-2 product listed
        for p in PRODUCTS:
            print('PRODUCT={0}'.format(p), file=fid)
            #-- get directories from remote directory (* splat operator)
            product_directory = '{0}.{1}'.format(p,RELEASE)
            d = posixpath.join(HOST,'ATLAS',product_directory)
            req = urllib2.Request(url=d)
            #-- compile regular expression operator
            args = (p,regex_track,regex_granule,RELEASE,regex_version,regex_suffix)
            R1 = re.compile(remote_regex_pattern.format(*args), re.VERBOSE)
            #-- read and parse request for subdirectories (find column names)
            tree = lxml.etree.parse(urllib2.urlopen(req), parser)
            colnames = tree.xpath('//td[@class="indexcolname"]//a/@href')
            remote_sub = [sd for sd in colnames if R2.match(sd)]
            #-- for each remote subdirectory
            for sd in remote_sub:
                #-- local directory for product and subdirectory
                if FLATTEN:
                    local_dir = os.path.expanduser(ddir)
                else:
                    local_dir = os.path.join(ddir,product_directory,sd)
                #-- check if data directory exists and recursively create if not
                os.makedirs(local_dir,MODE) if not os.path.exists(local_dir) else None
                #-- find ICESat-2 data files
                req = urllib2.Request(url=posixpath.join(d,sd))
                #-- read and parse request for remote files (columns and dates)
                tree = lxml.etree.parse(urllib2.urlopen(req), parser)
                colnames = tree.xpath('//td[@class="indexcolname"]//a/@href')
                collastmod = tree.xpath('//td[@class="indexcollastmod"]/text()')
                #-- find matching files (for granule, release, version, track)
                remote_file_lines = [i for i,f in enumerate(colnames) if R1.match(f)]
                #-- build lists of each ICESat-2 data file
                for i in remote_file_lines:
                    #-- remote and local versions of the file
                    remote_files.append(posixpath.join(d,sd,colnames[i]))
                    local_files.append(os.path.join(local_dir,colnames[i]))
                    #-- get last modified date and convert into unix time
                    LMD = time.strptime(collastmod[i].rstrip(),'%Y-%m-%d %H:%M')
                    remote_mtimes.append(calendar.timegm(LMD))
            #-- close request
            req = None

    #-- sync in series if PROCESSES is 0
    if (PROCESSES == 0):
        #-- sync each ICESat-2 data file
        for i,remote_file in enumerate(remote_files):
            #-- sync ICESat-2 files with NSIDC server
            output = http_pull_file(remote_file, remote_mtimes[i],
                local_files[i], LIST, CLOBBER, MODE)
            #-- print the output string
            print(output, file=fid)
    else:
        #-- sync in parallel with multiprocessing Pool
        pool = mp.Pool(processes=PROCESSES)
        #-- sync each ICESat-2 data file
        output = []
        for i,remote_file in enumerate(remote_files):
            #-- sync ICESat-2 files with NSIDC server
            args = (remote_file,remote_mtimes[i],local_files[i])
            kwds = dict(LIST=LIST, CLOBBER=CLOBBER, MODE=MODE)
            output.append(pool.apply_async(multiprocess_sync,args=args,kwds=kwds))
        #-- start multiprocessing jobs
        #-- close the pool
        #-- prevents more tasks from being submitted to the pool
        pool.close()
        #-- exit the completed processes
        pool.join()
        #-- print the output string
        for out in output:
            print(out.get(), file=fid)

    #-- close log file and set permissions level to MODE
    if LOG:
        fid.close()
        os.chmod(os.path.join(ddir,LOGFILE), MODE)
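
#-- NOTE: a minimal sketch of calling the sync routine directly from Python
#-- (the directory, years and track numbers are illustrative, and the opener
#-- must be built with build_opener() beforehand):
#--     nsidc_icesat2_sync(os.path.expanduser('~/icesat2'), ['ATL06'], '003',
#--         range(1,10), range(1,15), [234,1234], YEARS=[2019,2020],
#--         PROCESSES=2, MODE=0o775, LOG=True)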

#-- PURPOSE: wrapper for running the sync program in multiprocessing mode
def multiprocess_sync(remote_file, remote_mtime, local_file, LIST=False,
    CLOBBER=False, MODE=0o775):
    try:
        output = http_pull_file(remote_file,remote_mtime,local_file,
            LIST,CLOBBER,MODE)
    except:
        #-- if there has been an error exception
        #-- print the type, value, and stack trace of the
        #-- current exception being handled
        print('process id {0:d} failed'.format(os.getpid()))
        traceback.print_exc()
    else:
        return output

#-- PURPOSE: pull file from a remote host checking if file exists locally
#-- and if the remote file is newer than the local file
def http_pull_file(remote_file,remote_mtime,local_file,LIST,CLOBBER,MODE):
    #-- if file exists in file system: check if remote file is newer
    TEST = False
    OVERWRITE = ' (clobber)'
    #-- check if local version of file exists
    if os.access(local_file, os.F_OK):
        #-- check last modification time of local file
        local_mtime = os.stat(local_file).st_mtime
        #-- if remote file is newer: overwrite the local file
        if (remote_mtime > local_mtime):
            TEST = True
            OVERWRITE = ' (overwrite)'
    else:
        TEST = True
        OVERWRITE = ' (new)'
    #-- if file does not exist locally, is to be overwritten, or CLOBBER is set
    if TEST or CLOBBER:
        #-- output string for printing files transferred
        output = '{0} -->\n\t{1}{2}\n'.format(remote_file,local_file,OVERWRITE)
        #-- if executing copy command (not only printing the files)
        if not LIST:
            #-- Create and submit request. There are a wide range of exceptions
            #-- that can be thrown here, including HTTPError and URLError.
            request = urllib2.Request(remote_file)
            response = urllib2.urlopen(request)
            #-- chunked transfer encoding size
            CHUNK = 16 * 1024
            #-- copy contents to local file using chunked transfer encoding
            #-- transfer should work properly with ascii and binary data formats
            with open(local_file, 'wb') as f:
                shutil.copyfileobj(response, f, CHUNK)
            #-- keep remote modification time of file and local access time
            os.utime(local_file, (os.stat(local_file).st_atime, remote_mtime))
            os.chmod(local_file, MODE)
        #-- return the output string
        return output

#-- PURPOSE: help module to describe the optional input parameters
def usage():
    print('\nHelp: {0}'.format(os.path.basename(sys.argv[0])))
    print(' -U X, --user=X\t\tUsername for NASA Earthdata Login')
    print(' -N X, --netrc=X\tPath to .netrc file for authentication')
    print(' -D X, --directory=X\tWorking data directory')
    print(' -Y X, --year=X\t\tYears to sync separated by commas')
    print(' -S X, --subdirectory=X\tSubdirectories to sync separated by commas')
    print(' --release=X\t\tICESat-2 data release to sync')
    print(' --version=X\t\tICESat-2 data version to sync')
    print(' --granule=X\t\tICESat-2 granule regions to sync')
    print(' --track=X\t\tICESat-2 reference ground tracks to sync')
    print(' --auxiliary\t\tSync ICESat-2 auxiliary files for each HDF5 file')
    print(' -I X, --index=X\t\tInput index of ICESat-2 files to sync')
    print(' -F, --flatten\t\tDo not create subdirectories')
    print(' -P X, --np=X\t\tNumber of processes to use in file downloads')
    print(' -M X, --mode=X\t\tPermission mode of directories and files synced')
    print(' -L, --list\t\tOnly print files that are to be transferred')
    print(' -C, --clobber\t\tOverwrite existing data in transfer')
    print(' -l, --log\t\tOutput log file')
    today = time.strftime('%Y-%m-%d',time.localtime())
    LOGFILE = 'NSIDC_IceSat-2_sync_{0}.log'.format(today)
    print(' Log file format: {0}\n'.format(LOGFILE))
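
#-- NOTE: an illustrative full command line using the options parsed in main()
#-- (the username, directory and year values below are placeholders):
#--     python nsidc_icesat2_sync.py --user=<username> --directory=~/icesat2 \
#--         --release=003 --year=2019,2020 --granule=10,11,12 ATL06 ATL07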

#-- Main program that calls nsidc_icesat2_sync()
def main():
    #-- Read the system arguments listed after the program
    short_options = 'hU:N:D:Y:S:I:FP:LCM:l'
    long_options = ['help','user=','netrc=','directory=','year=','subdirectory=',
        'release=','version=','granule=','track=','auxiliary','index=',
        'flatten','np=','list','log','mode=','clobber']
    optlist,arglist = getopt.getopt(sys.argv[1:],short_options,long_options)

    #-- command line parameters
    USER = ''
    NETRC = None
    #-- Working data directory
    DIRECTORY = os.getcwd()
    YEARS = None
    SUBDIRECTORY = None
    VERSIONS = np.arange(1,10)
    RELEASE = '003'
    GRANULES = np.arange(1,15)
    TRACKS = np.arange(1,1388)
    AUXILIARY = False
    INDEX = None
    FLATTEN = False
    #-- sync in series if processes is 0
    PROCESSES = 0
    LIST = False
    LOG = False
    #-- permissions mode of the local directories and files (number in octal)
    MODE = 0o775
    CLOBBER = False
    for opt, arg in optlist:
        if opt in ('-h','--help'):
            usage()
            sys.exit()
        elif opt in ("-Y","--year"):
            YEARS = np.array(arg.split(','), dtype=int)
        elif opt in ("-S","--subdirectory"):
            SUBDIRECTORY = arg.split(',')
        elif opt in ("-U","--user"):
            USER = arg
        elif opt in ("-N","--netrc"):
            NETRC = os.path.expanduser(arg)
        elif opt in ("-D","--directory"):
            DIRECTORY = os.path.expanduser(arg)
        elif opt in ("--release",):
            RELEASE = '{0:03d}'.format(int(arg))
        elif opt in ("--version",):
            VERSIONS = np.array(arg.split(','), dtype=int)
        elif opt in ("--granule",):
            GRANULES = np.array(arg.split(','), dtype=int)
        elif opt in ("--track",):
            TRACKS = np.sort(arg.split(',')).astype(int)
        elif opt in ("--auxiliary",):
            AUXILIARY = True
        elif opt in ("-I","--index"):
            INDEX = os.path.expanduser(arg)
        elif opt in ("-P","--np"):
            PROCESSES = int(arg)
        elif opt in ("-F","--flatten"):
            FLATTEN = True
        elif opt in ("-L","--list"):
            LIST = True
        elif opt in ("-l","--log"):
            LOG = True
        elif opt in ("-M","--mode"):
            MODE = int(arg, 8)
        elif opt in ("-C","--clobber"):
            CLOBBER = True

    #-- ICESat-2 ATLAS products
    PROD = {}
    PROD['ATL03'] = 'Global Geolocated Photon Data'
    PROD['ATL04'] = 'Normalized Relative Backscatter'
    PROD['ATL06'] = 'Land Ice Height'
    PROD['ATL07'] = 'Sea Ice Height'
    PROD['ATL08'] = 'Land and Vegetation Height'
    PROD['ATL09'] = 'Atmospheric Layer Characteristics'
    PROD['ATL10'] = 'Sea Ice Freeboard'
    PROD['ATL12'] = 'Ocean Surface Height'
    PROD['ATL13'] = 'Inland Water Surface Height'

    #-- enter dataset to transfer as system argument
    if not INDEX and not arglist:
        for key,val in PROD.items():
            print('{0}: {1}'.format(key, val))
        raise Exception('No System Arguments Listed')
    #-- check that each data product entered was correctly typed
    keys = ','.join(sorted([key for key in PROD.keys()]))
    for p in arglist:
        if p not in PROD.keys():
            raise IOError('Incorrect Data Product Entered ({0})'.format(keys))

    #-- NASA Earthdata hostname
    HOST = 'urs.earthdata.nasa.gov'
    #-- get authentication
    if not USER and not NETRC:
        #-- check that NASA Earthdata credentials were entered
        USER = builtins.input('Username for {0}: '.format(HOST))
        #-- enter password securely from command-line
        PASSWORD = getpass.getpass('Password for {0}@{1}: '.format(USER,HOST))
    elif NETRC:
        USER,LOGIN,PASSWORD = netrc.netrc(NETRC).authenticators(HOST)
    else:
        #-- enter password securely from command-line
        PASSWORD = getpass.getpass('Password for {0}@{1}: '.format(USER,HOST))

    #-- build a urllib opener for NSIDC
    #-- Add the username and password for NASA Earthdata Login system
    build_opener(USER,PASSWORD)

    #-- check internet connection before attempting to run program
    #-- check NASA Earthdata credentials before attempting to run program
    if check_credentials():
        nsidc_icesat2_sync(DIRECTORY, arglist, RELEASE, VERSIONS, GRANULES,
            TRACKS, YEARS=YEARS, SUBDIRECTORY=SUBDIRECTORY, AUXILIARY=AUXILIARY,
            INDEX=INDEX, FLATTEN=FLATTEN, PROCESSES=PROCESSES, LOG=LOG,
            LIST=LIST, MODE=MODE, CLOBBER=CLOBBER)

#-- run main program
if __name__ == '__main__':
    main()