# Provenance: this script was recovered from a whitespace-collapsed git patch.
#   commit:  b08400c7a793d3960b991ade338e676ceb09ab80
#   From:    Shawn, Fri, 18 Nov 2022 10:13:31 -0500
#   Subject: [PATCH] Update MANIFEST.in
# The patch appended this code to MANIFEST.in (index 9561fb1..0a814ab), after
# the existing line "include README.rst" and an added stray line "010747".
"""Count token frequencies in the ``state`` column of ``sample.json``.

The script reads ``sample.json`` as multi-line JSON, splits every ``state``
value on ``from``, drops commas, breaks the result into space-separated
tokens, strips the substrings listed in ``list1`` from each token, and
prints how often each remaining token occurs.
"""

from loanElig.loanEngineering.features.loanEligibility import spark_app
from pyspark.sql import functions as F

spark = spark_app()


def table_identifier(col):
    """Count how often each space-separated word occurs.

    Args:
        col: Presumably an RDD whose records are indexable and hold a string
            at index 1 -- TODO confirm against callers.

    Returns:
        The result of ``reduceByKey`` over ``(word, 1)`` pairs, i.e. one
        ``(word, count)`` pair per distinct word.
    """
    # The RDD methods are camelCase: flatMap / reduceByKey.
    words = col.map(lambda record: record[1]).flatMap(
        lambda text: text.split(' ')
    )
    return words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)


def replace_special_chars(col):
    """Remove a fixed set of SQL keywords from every string in ``col``.

    Args:
        col: Presumably an RDD of strings -- TODO confirm against callers.

    Returns:
        ``col`` mapped so that every occurrence of ``select``, ``where``,
        ``group``, ``by`` and ``order`` is deleted from each element.
    """
    keywords = ('select', 'where', 'group', 'by', 'order')

    def strip_keywords(text):
        # One mapper that loops over all keywords: avoids the late-binding
        # closure problem of creating one lambda per keyword inside a loop.
        for keyword in keywords:
            text = text.replace(keyword, "")
        return text

    return col.map(strip_keywords)


df = spark.read.option("multiline", "true").json('sample.json')

# Substrings stripped from every token by ``repl``.
list1 = ['select', 'where', 'group', 'by', 'order', '*', 'in', "[", '(', ']', ')']

# ``F.split`` yields an array column, while ``F.regexp_replace`` needs a
# string, so the pieces are re-joined with a space before commas are removed.
# NOTE(review): joining with a single space is an interpretation of the
# original intent -- confirm.
df = (
    df.withColumn(
        'cnt',
        F.regexp_replace(
            F.concat_ws(" ", F.split(F.col('state'), 'from')), ",", ""
        ),
    )
    .withColumn('cnt2', F.explode(F.split(F.col('cnt'), " ")))
)


def repl(col):
    """Strip every substring listed in ``list1`` from ``col``.

    Args:
        col: A single token, or ``None``.

    Returns:
        The token with all ``list1`` substrings removed; ``None`` is returned
        unchanged.
    """
    if col is None:
        return None
    # NOTE(review): ``str.replace`` removes the substrings anywhere in the
    # token, so e.g. 'in' and 'by' are also cut out of longer words.
    for fragment in list1:
        col = col.replace(fragment, "")
    return col


rep1 = F.udf(repl)

# Both conditions must hold (``&``): with ``|`` empty tokens were kept,
# because an empty string is not null.
df = (
    df.withColumn('cnt2', rep1(F.col('cnt2')))
    .filter(F.col('cnt2').isNotNull() & (F.col('cnt2') != ""))
    .groupBy('cnt2')
    .count()
)

df.show()