From db67c10e40e3efbefc2c41da5871b73cc205ee21 Mon Sep 17 00:00:00 2001
From: Timur <timur.vakhitov@firstlinesoftware.com>
Date: Fri, 9 Sep 2022 16:29:49 +0300
Subject: [PATCH] new function GetSpreadSheet added [AVOF-3651]

---
 .../GetSpreadSheet.sql                        | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 working/packages/DevV5_additional_functions/GetSpreadSheet.sql

diff --git a/working/packages/DevV5_additional_functions/GetSpreadSheet.sql b/working/packages/DevV5_additional_functions/GetSpreadSheet.sql
new file mode 100644
index 000000000..2fdc27058
--- /dev/null
+++ b/working/packages/DevV5_additional_functions/GetSpreadSheet.sql
@@ -0,0 +1,64 @@
+CREATE OR REPLACE FUNCTION devv5.GetSpreadSheet (url text)
+RETURNS SETOF record
+AS
+$BODY$
+  '''
+  The function allows you to get a standard Google spreadsheet as a sql query:
+  SELECT * FROM devv5.GetSpreadSheet (spreadsheet_url) AS (id int4, column1_name column1_type, column2_name column2_type [, ...]);
+  
+  Since these tables can have a different number of columns and different names, it is necessary to specify the all output fields and their type in each specific case; at the same time, the names of the columns in the spreadsheet and at the output of the function must match.
+  The function also displays the pseudo-column "id" with sequential row numbering (starting from the second row, because the first is the table header).
+  
+  Example
+  Let's say we have a spreadsheet https://docs.google.com/spreadsheets/d/1dJqEfmquPioOKPTKbTUmhCM1v12PYiB79gIx9baH6wM/edit#gid=0 with the following columns:
+  source_description	source_code	cnt	unit	percentile_5	percentile_95	flag	comment	target_concept_id	concept_code	concept_name	concept_class_id	standard_concept	invalid_reason	domain_id	target_vocabulary_id
+  then we need to write the following query:
+  SELECT * FROM devv5.GetSpreadSheet('https://docs.google.com/spreadsheets/d/1dJqEfmquPioOKPTKbTUmhCM1v12PYiB79gIx9baH6wM/edit#gid=0')
+  AS (
+    id int4,
+    source_description text,
+    source_code text,
+    cnt	int4,
+    unit text,
+    percentile_5 text,
+    percentile_95 text,
+    flag text,
+    comment text,
+    target_concept_id text,
+    concept_code text,
+    concept_name text,
+    concept_class_id text,
+    standard_concept text,
+    invalid_reason text,
+    domain_id text,
+    target_vocabulary_id text
+    );
+
+  And of course, you can do whatever you need with this query - add WHERE conditions, wrap it in CREATE TABLE, etc. Moreover, you can even change the position of the columns, for example:
+  SELECT * FROM devv5.GetSpreadSheet('https://docs.google.com/spreadsheets/d/1dJqEfmquPioOKPTKbTUmhCM1v12PYiB79gIx9baH6wM/edit#gid=0')
+  AS (
+    id int4,
+    domain_id text,
+    target_vocabulary_id text,
+    source_description text,
+    ...
+    );
+  
+  Note: don't forget to give view permissions to the service email
+  Note: since the spreadsheet request is in real time over HTTPS, do not use this function in joins, first create a local table, then work with it
+  Note: the function only works with the first worksheet
+  '''
+  import gspread
+
+  res = []
+  credentials = eval(plpy.execute("SELECT var_value::json FROM config$ where var_name='gspread_credentials'")[0]['var_value'])
+  gc = gspread.service_account_from_dict(credentials)
+  sh = gc.open_by_url(url)
+  #get first worksheet, all records, replace all empty cells with null
+  res = sh.get_worksheet(0).get_all_records(default_blank=None)
+  #add row number
+  res = [{**{'id':id + 2}, **row} for id, row in enumerate(res)]
+  
+  return res
+$BODY$
+LANGUAGE 'plpython3u' STRICT;
\ No newline at end of file