Skip to content

Commit

Permalink
Merge pull request #9 from dondaum/change-structure
Browse files Browse the repository at this point in the history
Change package structure, refactored modules and add full test method support
  • Loading branch information
dondaum authored Jan 6, 2020
2 parents 59c433c + fdd413c commit 55a850a
Show file tree
Hide file tree
Showing 42 changed files with 2,198 additions and 405 deletions.
Binary file modified .DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
*.pyc
#setup
UNKNOWN.egg-info
parsesql/config/configuration.json
parsesql/config/test.json
.github
parser/config/configuration.json
parser/config/test.json
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]
flake8 = "*"

[packages]
sqlalchemy = "*"
Expand Down
31 changes: 26 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,23 @@
A small python based sql parser focusing on finding table dependencies within database views. Currently only working with Snowflake ANSI Sql

The current implementation handles the parser as an seperate app that can be used to parse sql files. This is not a stable version. Within the
next month the goal is to translate that app in a pip package.
next month the goal is to translate that app in a pip package.

The parser works currently only if no AS or as is used in FROM or JOIN conditions, e.g.

```
SELECT
*
FROM a AS oop
```
-> This won't work

However this syntax will work:
```
SELECT
*
FROM a oop
```

## How to use the parser:
1. Download the repository
Expand All @@ -11,12 +27,17 @@ next month the goal is to translate that app in a pip package.
4. Configure the SQLAlchemy engine
5. Create the target database table (Sqllite or Snowflake engine) with SQLAlchemy. Therefore run:
```
cd parsesql/
python -m main.database.init_db
python -m parsesql.main.database.init_db
```
Note: A sqlite file will be placed in the db directory of the package

6. Configure the Runner class (multiprocessing, parsing vs. dataloading)
7. Run the main module with:
```
cd parsesql/
python app.py
python -m parsesql.app
```

For running test use:
```
python -m tests.run_all
```
Binary file modified parsesql/.DS_Store
Binary file not shown.
92 changes: 55 additions & 37 deletions parsesql/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
Expand All @@ -20,39 +20,62 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from main.sql_parser.snowsqlparser import ParseSql
from main.sql_parser.file_finder import FileFinder
from main.database.db_engine import Session
from main.database.models import TableDependency
from main.executers import SequentialExecuter, MultiProcessingExecuter
from parsesql.main.sql_parser.file_finder import FileFinder
from parsesql.main.database.db_engine import Session
from parsesql.main.database.models import TableDependency
from parsesql.main.sql_parser.snowsqlparser import ParseSql
from parsesql.main.executers import SequentialExecuter, MultiProcessingExecuter
import uuid
import time


class Runner(object):
class Runner():
"""
:param bulk_load: if true all insert will hapen in a bulk load else single
INSERTs in the database
:param parallelism: if > 0 then work gets applied on multiple cpus. Notice
this will reduce work on number of cpus on machine minus - 1. So if your
machine has 4 cpus it will run on max 3 cpus concurrently even if 8 is
given as a parameter
"""
def __init__(self, parallelism=0, bulk_load=True):
self.allfiles = None
self.dependencies = list()
self.dependencies = []
self.parallelism = parallelism
self.bulk_load = bulk_load
self.executer = self._get_executer()

def _get_executer(self):
"""
factory class method that return correct executer
"""
if self.parallelism <= 0:
return SequentialExecuter()
else:
return MultiProcessingExecuter()

def parseSql(self) -> None:
self.findFiles()
self.executer.to_parse_files = self.allfiles
result = self.executer.run()
self.dependencies = result

def findFiles(self) -> None:
return MultiProcessingExecuter(cpu_cores=self.parallelism)

def start_sql_parsing(self) -> None:
"""
main sql parsing method that searches for files, prime and config
executer and call run method
"""
self.search_files()
self.executer.target_list = self.allfiles
self.executer.klass = ParseSql
self.executer.klass_method_name = "parse_dependencies"
self.dependencies = self.executer.run()

def search_files(self) -> None:
"""
Instance method that searches all files
"""
self.allfiles = FileFinder().getListOfFiles()

def _data_load(self):
"""
factory method that either single or bulk insert data in the
database
"""
if self.bulk_load:
self._bulkinsertdep()
else:
Expand All @@ -62,11 +85,11 @@ def _insertdep(self) -> None:
session = Session()
for sqlobject in self.dependencies:
for table in sqlobject['tables']:
dbentry = TableDependency( objectName = sqlobject['name'] ,
filename = sqlobject['filename'],
dependentTableName= table,
uuid = str(uuid.uuid1())
)
dbentry = TableDependency(objectName=sqlobject['name'],
filename=sqlobject['filename'],
dependentTableName=table,
uuid=str(uuid.uuid1())
)
session.add(dbentry)
session.commit()
session.close()
Expand All @@ -76,28 +99,23 @@ def _bulkinsertdep(self) -> None:
bulkinsertobjects = list()
for sqlobject in self.dependencies:
for table in sqlobject['tables']:
dbentry = TableDependency( objectName = sqlobject['name'] ,
filename = sqlobject['filename'],
dependentTableName= table,
uuid = str(uuid.uuid1())
)
dbentry = TableDependency(objectName=sqlobject['name'],
filename=sqlobject['filename'],
dependentTableName=table,
uuid=str(uuid.uuid1())
)
bulkinsertobjects.append(dbentry)
session.bulk_save_objects(bulkinsertobjects)
session.commit()
session.close()

def start(self) -> None:
self.parseSql()
self.start_sql_parsing()
self._data_load()


if __name__ == "__main__":
starttime = time.time()
Runner(parallelism=1, bulk_load=True).start()
Runner(parallelism=8, bulk_load=True).start()
endtime = time.time()
print('Time needed:', endtime - starttime )






print('Time needed:', endtime - starttime)
18 changes: 10 additions & 8 deletions parsesql/config/config_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
Expand All @@ -24,17 +24,17 @@
import os
import sys
from pathlib import Path
from util.logger_service import LoggerMixin


class Configuration(LoggerMixin):
def __init__(self, filename:str):
class Configuration():
def __init__(self, filename: str = 'configuration.json'):
self.abspath = os.path.dirname(os.path.abspath(__file__))
self.filename = filename
self.configfilepath = os.path.join(self.abspath, self.filename)
self.data = self.read()
self.sqldir = self.get_sql_directory()
self.file_extension = self.data['file_extension']
self.logger_config = {"Logging": self.data['logging']}
self.strategy = self.data['strategy']
if self.strategy == "snowflake":
self.snowflake_account = self.data['Snowflake_Account']
Expand All @@ -44,8 +44,9 @@ def read(self):
with open(self.configfilepath) as json_data_file:
return json.load(json_data_file)
except FileNotFoundError as e:
self.logger.info(f"Cannot find file {self.filename}. Please check if file existing. "
f"See this error: {e}")
print(f"Cannot find file {self.filename}. "
f"Please check if file existing. "
f"See this error: {e}")
sys.exit()

def get_sql_directory(self):
Expand All @@ -54,4 +55,5 @@ def get_sql_directory(self):
"""
return Path(self.data['sqldirectory'])

Config = Configuration(filename='configuration.json')

Config = Configuration()
13 changes: 0 additions & 13 deletions parsesql/config/configuration.json

This file was deleted.

4 changes: 4 additions & 0 deletions parsesql/config/example_configuration.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,9 @@
"database": "database",
"schema": "schema",
"warehouse": "warehouse"
},
"logging": {
"format": "[%(asctime)s] [%(processName)-10s] [%(name)s] [%(levelname)s] -> %(message)s",
"level": "INFO"
}
}
File renamed without changes.
File renamed without changes.
File renamed without changes.
6 changes: 3 additions & 3 deletions parsesql/main/database/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
Expand All @@ -22,4 +22,4 @@

from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
Base = declarative_base()
63 changes: 33 additions & 30 deletions parsesql/main/database/db_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
Expand All @@ -20,42 +20,45 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from snowflake.sqlalchemy import URL
from config.config_reader import Config
from parsesql.config.config_reader import Config
from parsesql import db
from sqlalchemy import create_engine
import os

class DatabaseEngine():
def __init__(self):
self.strategy = Config.strategy

def get_engine(self):
if self.strategy == 'sqllite':
return self.get_engine_sqllite()
if self.strategy == 'snowflake':
return self.get_snowflake_engine()

def get_engine_sqllite(self):
return create_engine('sqlite:///parsersql.db', echo=True)

def get_snowflake_engine(self):
return create_engine(URL(
user=Config.snowflake_account['user'],
password= Config.snowflake_account['password'],
account=Config.snowflake_account['account'],
database=Config.snowflake_account['database'],
schema = Config.snowflake_account['schema'],
warehouse = Config.snowflake_account['warehouse']
)
, echo=True
)
DBPATH = os.path.dirname(db.__file__)

db_engine = DatabaseEngine().get_engine()
Session = sessionmaker(bind=db_engine)

class DatabaseEngine():

def __init__(self,
strategy=None
):
self.strategy = strategy or Config.strategy

def get_engine(self):
if self.strategy == 'sqllite':
return self._get_engine_sqllite()
if self.strategy == 'snowflake':
return self._get_snowflake_engine()

def _get_engine_sqllite(self):
dbname = "parsersql.db"
url = os.path.join(DBPATH, dbname)
return create_engine('sqlite:///' + url, echo=True)

def _get_snowflake_engine(self):
return create_engine(URL(
user=Config.snowflake_account['user'],
password=Config.snowflake_account['password'],
account=Config.snowflake_account['account'],
database=Config.snowflake_account['database'],
schema=Config.snowflake_account['schema'],
warehouse=Config.snowflake_account['warehouse']
), echo=True
)


db_engine = DatabaseEngine().get_engine()
Session = sessionmaker(bind=db_engine)
Loading

0 comments on commit 55a850a

Please sign in to comment.