Merge pull request #3 from soar-zhengjian/master

Add some operations for UAI Train job
ucloud · Nov 15, 2017 · cc0c3ac · cc0c3ac
2 parents 9d4e7a7 + a31c49e
commit cc0c3ac
Show file tree

Hide file tree

Showing 26 changed files with 769 additions and 90 deletions.
diff --git a/uai/utils/logger.py b/uai/utils/logger.py
@@ -72,3 +72,9 @@ def format_normal(self):
 
     def format_exception(self, *args):
         return ("%s%s: " + self.__message) % args % self.__args
+
+def printConsoleOnlyError():
+    """Limit the console log handler to ERROR and rebuild the service logger.
+
+    Mutates the module-level ``LOGGING`` dict in place, re-applies it through
+    ``logging.config.dictConfig`` and rebinds the module-level ``uai_logger``
+    to the "uaiservice" logger.
+    """
+    global uai_logger
+    console_handler = LOGGING['handlers']['console']
+    console_handler['level'] = 'ERROR'
+    logging.config.dictConfig(LOGGING)
+    uai_logger = logging.getLogger("uaiservice")
diff --git a/uai/utils/utils.py b/uai/utils/utils.py
@@ -2,7 +2,6 @@
 import hashlib
 import tarfile
 import json
-
 GATEWAY_DEFAULT='Default'
 
 def _verfy_ac(private_key, params):
@@ -17,7 +16,6 @@ def _verfy_ac(private_key, params):
     sign = hashlib.sha1()
     sign.update(params_data.encode('utf-8')) # must encode to adapt python3
     signature = sign.hexdigest()
-    print("Signature",signature)
     return signature
 
 def val_to_str(val):

diff --git a/uaitrain/api/base_op.py b/uaitrain/api/base_op.py
@@ -65,7 +65,8 @@ def _cmd_common_request(self):
             self.cmd_params.pop('Signature')
         self.cmd_params['Signature'] = _verfy_ac(self.priv_key,
                                                  self.cmd_params)
-        print (self.cmd_params)
+        uai_logger.info("Signature: {0}".format(self.cmd_params['Signature']))
+        uai_logger.info(self.cmd_params)
         uai_logger.info("Call http request: {0} ".format(get_request(self.cmd_url, params=self.cmd_params)))
         r = requests.get(self.cmd_url, params=self.cmd_params)
         rsp = json.loads(r.text, 'utf-8')

diff --git a/uaitrain/api/get_train_job_predict_start_time.py b/uaitrain/api/get_train_job_predict_start_time.py
@@ -0,0 +1,51 @@
+# Copyright 2017 The UAI-SDK Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from uaitrain.api.base_op import BaseUAITrainAPIOp
+
+class GetUAITrainJobStartPredictOp(BaseUAITrainAPIOp):
+    """
+    GetUAITrainJobStartPredictOp
+        Compatible with UAI Train GetUAITrainJobStartPredict API func
+        Input:
+            pub_key             string(required) Public key of the user
+            priv_key            string(required) Private key of the user
+            project_id          int(optional)    Project ID of the job
+            region              string(optional) Which Region to run the job
+            zone                string(optional) Which Zone in the Region to run the job
+            job_id              string(required) Job id of the job
+
+        Output:
+            RetCode       int(required)                Op return code: 0: success, others: error code
+            Action        string(required)             Action name
+            Message       string(not required)         Message: error description
+
+    """
+    ACTION_NAME = "GetUAITrainJobStartPredict"
+
+    def __init__(self, pub_key, priv_key, job_id, project_id="", region="", zone=""):
+        """Initialise the base op with ACTION_NAME and store job_id as the "TrainJobId" param."""
+        super(GetUAITrainJobStartPredictOp, self).__init__(self.ACTION_NAME,
+                                                     pub_key,
+                                                     priv_key,
+                                                     project_id,
+                                                     region,
+                                                     zone)
+        self.cmd_params["TrainJobId"] = job_id
+
+    def _check_args(self):
+        """Run the base-class checks, then validate the job id.
+
+        Raises:
+            RuntimeError: if "TrainJobId" is not a non-empty str.
+        """
+        super(GetUAITrainJobStartPredictOp, self)._check_args()
+
+        job_id = self.cmd_params["TrainJobId"]
+        # The type test has to be applied to the value itself:
+        # type(value != str) is the type of a bool, which is always truthy
+        # and would make this check reject every job id.
+        if not isinstance(job_id, str) or job_id == "":
+            raise RuntimeError("job_id shoud be <str> and is not nil.")
diff --git a/uaitrain/api/get_train_job_running_log.py b/uaitrain/api/get_train_job_running_log.py
@@ -0,0 +1,51 @@
+# Copyright 2017 The UAI-SDK Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from uaitrain.api.base_op import BaseUAITrainAPIOp
+
+class GetUAITrainRunningLogOp(BaseUAITrainAPIOp):
+    """
+    GetUAITrainRunningLogOp
+        Compatible with UAI Train GetUAITrainRunningLog API func
+        Input:
+            pub_key             string(required) Public key of the user
+            priv_key            string(required) Private key of the user
+            project_id          int(optional)    Project ID of the job
+            region              string(optional) Which Region to run the job
+            zone                string(optional) Which Zone in the Region to run the job
+            job_id              string(required) Job id of the job
+
+        Output:
+            RetCode       int(required)                Op return code: 0: success, others: error code
+            Action        string(required)             Action name
+            Message       string(not required)         Message: error description
+            RunningLog    []string                     realtime log that train job produces
+    """
+    ACTION_NAME = "GetUAITrainRunningLog"
+
+    def __init__(self, pub_key, priv_key, job_id, project_id="", region="", zone=""):
+        """Initialise the base op with ACTION_NAME and store job_id as the "TrainJobId" param."""
+        super(GetUAITrainRunningLogOp, self).__init__(self.ACTION_NAME,
+                                                     pub_key,
+                                                     priv_key,
+                                                     project_id,
+                                                     region,
+                                                     zone)
+        self.cmd_params["TrainJobId"] = job_id
+
+    def _check_args(self):
+        """Run the base-class checks, then validate the job id.
+
+        Raises:
+            RuntimeError: if "TrainJobId" is not a non-empty str.
+        """
+        super(GetUAITrainRunningLogOp, self)._check_args()
+
+        job_id = self.cmd_params["TrainJobId"]
+        if not isinstance(job_id, str) or job_id == "":
+            raise RuntimeError("job_id shoud be str and is not nil.")
diff --git a/uaitrain/arch/pytorch/uargs.py b/uaitrain/arch/pytorch/uargs.py
@@ -1,66 +1,66 @@
-# Copyright 2017 The UAI-SDK Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-def add_uai_args(parser):
-    uai = parser.add_argument_group('uai-args', 'the UAI related args')
-
-    '''
-        Default work dir. The working dir for the traing job, it will contains:
-            /data/data     --data_dir
-            /data/output   --output_dir
-
-        Note: DO NOT CHANGE THIS VALUE
-            UCloud Train Job Executor Will Set it Automatically
-    '''
-    uai.add_argument('--work_dir', type=str, default="/data", help='Default work path')
-
-    '''
-        Default data path used in Training, all data will be downloaded into this path
-        Please use data in this path as input for Training
-
-        Note: DO NOT CHANGE THIS VALUE
-            UCloud Train Job Executor Will Set it Automatically
-    '''
-    uai.add_argument("--data_dir", type=str, default="/data/data", help="Default data path")
-
-    '''
-        Default output path used in Training, files in this path will be uploaded to UFile 
-        after training finished.
-        You can also assume your checkpoint files inside output_path (If you provided 
-        in the UCloud console), files will also be downloaded into this path befor 
-        Training start
-
-        Note: DO NOT CHANGE THIS VALUE
-            UCloud Train Job Executor Will Set it Automatically
-    '''
-    uai.add_argument("--output_dir", type=str, default="/data/output", help="Default output path")
-
-    '''
-        Default tensorboard output path used in Training, iles in this path will be uploaded to UFile 
-        after training finished.
-        This dir is same as output_dir
-
-        Note: DO NOT CHANGE THIS VALUE
-            UCloud Train Job Executor Will Set it Automatically
-    '''
-    uai.add_argument("--log_dir", type=str, default="/data/output", help="Default log path")
-
-    '''
-        Define num_gpus for training
-
-        Note: DO NOT CHANGE THIS VALUE
-            UCloud Train Job Executor Will Set it Automatically
-    '''
+# Copyright 2017 The UAI-SDK Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+def add_uai_args(parser):
+    """Register the UAI training arguments on ``parser``.
+
+    Creates the 'uai-args' argument group and adds --work_dir, --data_dir,
+    --output_dir, --log_dir and --num_gpus to it.
+
+    Note: DO NOT CHANGE THE DEFAULT VALUES
+        UCloud Train Job Executor Will Set them Automatically
+    """
+    uai = parser.add_argument_group('uai-args', 'the UAI related args')
+
+    # (flag, default, help) of every path-valued argument, in registration order.
+    path_args = (
+        # Working dir of the training job. It contains:
+        #     /data/data     --data_dir
+        #     /data/output   --output_dir
+        ('--work_dir', "/data", 'Default work path'),
+        # Data path used in training: all data is downloaded into this path,
+        # use the data in it as the training input.
+        ("--data_dir", "/data/data", "Default data path"),
+        # Output path used in training: files in this path are uploaded to
+        # UFile after training finishes. Checkpoint files provided in the
+        # UCloud console are also downloaded into this path before training
+        # starts.
+        ("--output_dir", "/data/output", "Default output path"),
+        # Tensorboard output path: same dir as output_dir, so its files are
+        # uploaded to UFile after training finishes as well.
+        ("--log_dir", "/data/output", "Default log path"),
+    )
+    for flag, default_path, help_text in path_args:
+        uai.add_argument(flag, type=str, default=default_path, help=help_text)
+
+    # Number of GPUs for training.
+    # Note: DO NOT CHANGE THIS VALUE
+    #     UCloud Train Job Executor will set it automatically.
     uai.add_argument("--num_gpus", type=int, help="Num of avaliable gpus")
diff --git a/uaitrain/arch_conf/__init__.py b/uaitrain/arch_conf/__init__.py
diff --git a/uaitrain/arch_conf/tf_conf.py b/uaitrain/arch_conf/tf_conf.py
diff --git a/uaitrain/cmd/__init__.py b/uaitrain/cmd/__init__.py
diff --git a/uaitrain/operation/base_op.py b/uaitrain/operation/base_op.py
@@ -54,16 +54,16 @@ def _add_args(self):
     def _parse_args(self, args):
         self.pub_key = args['public_key']
         self.pri_key = args['private_key']
-        if args['project_id'] != None: 
+        if 'project_id' in args and args['project_id'] != None:
             self.project_id = args['project_id']
         else:
             self.project_id = ""
-        if args['region'] != None:
+        if 'region' in args and args['region'] != None:
             self.region = args['region']
         else:
             self.region = ""
 
-        if args['zone'] != None:
+        if 'zone' in args and args['zone'] != None:
             self.zone = args['zone']
         else:
             self.zone = ""

diff --git a/uaitrain/operation/get_realtime_log/__init__.py b/uaitrain/operation/get_realtime_log/__init__.py
diff --git a/uaitrain/operation/get_realtime_log/base_log_op.py b/uaitrain/operation/get_realtime_log/base_log_op.py
@@ -0,0 +1,93 @@
+# Copyright 2017 The UAI-SDK Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import time
+from uai.utils.logger import uai_logger
+from uai.utils.logger import printConsoleOnlyError
+from uaitrain.operation.base_op import BaseUAITrainOp
+from uaitrain.api.get_train_job_running_log import GetUAITrainRunningLogOp
+from uaitrain.api.get_train_job_list import GetUAITrainJobListOp
+
+class BaseUAITrainGetRealtimeLogOp(BaseUAITrainOp):
+    """Command op for the ``log`` sub-command.
+
+    Repeatedly fetches the running log of a UAI Train job through
+    GetUAITrainRunningLogOp and prints it, until GetUAITrainJobListOp
+    reports a finished status for the job.
+    """
+
+    # Seconds to sleep between two polling rounds.
+    POLL_INTERVAL = 10
+    # Job states after which polling stops.
+    FINISHED_STATUS = ('Done', 'Stopped', 'Deleted', 'Error')
+
+    def __init__(self, parser):
+        super(BaseUAITrainGetRealtimeLogOp, self).__init__(parser)
+        # Raise the console handler level to ERROR; presumably this keeps the
+        # printed job log free of SDK info output.
+        printConsoleOnlyError()
+
+    def _add_job_info_args(self, job_parser):
+        """Add the required ``--job_id`` argument to ``job_parser``."""
+        info_parser = job_parser.add_argument_group(
+            'Job Info Params', 'Job Infos')
+        info_parser.add_argument(
+            '--job_id',
+            type=str,
+            required=True,
+            help='The <job_id> to query')
+
+    def _add_args(self):
+        """Create the ``log`` sub-parser and attach account and job-info arguments."""
+        parser = self.parser.add_parser('log', help='Get realtime log of UAI Train Job')
+        self.job_parser = parser
+        self._add_account_args(parser)
+        self._add_job_info_args(parser)
+
+    def _parse_args(self, args):
+        """Parse the common arguments via the base class and store ``job_id``.
+
+        Returns:
+            True, always.
+        """
+        super(BaseUAITrainGetRealtimeLogOp, self)._parse_args(args)
+
+        self.job_id = args['job_id']
+        return True
+
+    def _check_job_running(self):
+        """Return True while the job has not reached a finished status.
+
+        Returns False when the status request fails, when the response holds
+        no job entry, or when the job status is one of FINISHED_STATUS.
+        """
+        job_op = GetUAITrainJobListOp(
+            pub_key=self.pub_key,
+            priv_key=self.pri_key,
+            job_id=self.job_id,
+            project_id=self.project_id,
+            region=self.region,
+            zone=self.zone)
+
+        succ, resp = job_op.call_api()
+        if succ is False:
+            print("Error get job status info. job {0} ".format(self.job_id))
+            return False
+
+        # A missing or empty DataSet means the job could not be found; report
+        # it like a failed status request instead of raising on the [0] lookup.
+        data_set = resp.get('DataSet') or []
+        if not data_set:
+            print("Error get job status info. job {0} ".format(self.job_id))
+            return False
+
+        return data_set[0]['Status'] not in self.FINISHED_STATUS
+
+    def cmd_run(self, args):
+        """Poll and print the job's running log until the job is finished.
+
+        Returns:
+            False if argument parsing reports failure, True once polling ends.
+
+        Note: a failing log request is retried every POLL_INTERVAL seconds
+        without an upper bound.
+        """
+        if self._parse_args(args) == False:
+            return False
+
+        while True:
+            log_op = GetUAITrainRunningLogOp(
+                pub_key=self.pub_key,
+                priv_key=self.pri_key,
+                job_id=self.job_id,
+                project_id=self.project_id,
+                region=self.region,
+                zone=self.zone)
+
+            succ, resp = log_op.call_api()
+            if succ is False:
+                # Logger.warn is a deprecated alias; warning() is the supported name.
+                uai_logger.warning("Error get realtime log info. job {0}, check your job_id, it may be not running.".format(self.job_id))
+                time.sleep(self.POLL_INTERVAL)
+                continue
+
+            # A missing or null RunningLog is treated as "no new lines".
+            for log in resp.get('RunningLog') or []:
+                print (log)
+
+            if self._check_job_running() is not True:
+                break
+            time.sleep(self.POLL_INTERVAL)
+        return True
diff --git a/uaitrain/operation/get_tensorboard_url/__init__.py b/uaitrain/operation/get_tensorboard_url/__init__.py