Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/SDMX-utils #375

Merged
merged 19 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion vtl-sdmx/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
<dependency>
<groupId>io.sdmx</groupId>
<artifactId>fusion-sdmx-ml</artifactId>
<version>1.0.59-SNAPSHOT</version>
<version>1.1.9-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>fr.insee.trevas</groupId>
Expand Down
15 changes: 9 additions & 6 deletions vtl-sdmx/src/main/java/fr/insee/vtl/sdmx/TrevasSDMXUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import io.sdmx.utils.core.io.ReadableDataLocationTmp;

import java.io.InputStream;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.stream.Collector;
Expand Down Expand Up @@ -117,12 +118,14 @@ public static Structured.DataStructure buildStructureFromSDMX3(SdmxBeans beans,
}

public static Map<String, DataStructureBean> dataflows(SdmxBeans sdmxBeans) {
return sdmxBeans.getDataflows().stream().collect(Collectors.toMap(
INamedBean::getId,
dataflowBean -> sdmxBeans.getDataStructures(dataflowBean.getDataStructureRef())
.stream()
.collect(toSingleton())
));
return sdmxBeans.getDataflows().stream()
.map(df -> sdmxBeans.getDataStructures(df.getDataStructureRef()))
.distinct()
.flatMap(Collection::stream)
.collect(Collectors.toMap(
INamedBean::getId,
dataStructureBean -> dataStructureBean
));
}

public static Map<String, DataStructureBean> vtlMapping(SdmxBeans sdmxBeans) {
Expand Down
48 changes: 22 additions & 26 deletions vtl-sdmx/src/test/java/fr/insee/vtl/BPETest.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,24 +55,23 @@ public void bpeV1() throws ScriptException {
assertThat(bpeDetailDs.getDataStructure().size()).isEqualTo(6);

ScriptContext context = engine.getContext();
context.setAttribute("BPE_DETAIL", bpeDetailDs, ScriptContext.ENGINE_SCOPE);
context.setAttribute("BPE_DETAIL_VTL", bpeDetailDs, ScriptContext.ENGINE_SCOPE);

// Step 1
engine.eval("" +
"define datapoint ruleset UNIQUE_MUNICIPALITY (variable DEPCOM) is\n" +
" MUNICIPALITY_FORMAT_RULE : match_characters(DEPCOM, \"[0-9]{5}|2[A-B][0-9]{3}\") errorcode \"Municipality code is not in the correct format\"\n" +
"end datapoint ruleset;\n" +
"\n" +
"CHECK_MUNICIPALITY := check_datapoint(BPE_DETAIL, UNIQUE_MUNICIPALITY invalid);");
"CHECK_MUNICIPALITY := check_datapoint(BPE_DETAIL_VTL, UNIQUE_MUNICIPALITY invalid);");

Dataset checkMunicipality = (Dataset) engine.getContext().getAttribute("CHECK_MUNICIPALITY");

assertThat(checkMunicipality.getDataPoints()).isEmpty();

// Step 2
engine.eval("BPE_DETAIL_CLEAN := BPE_DETAIL" +
" [drop LAMBERT_X, LAMBERT_Y]\n" +
" [rename ID_EQUIPEMENT to id, TYPEQU to facility_type, DEPCOM to municipality, REF_YEAR to year];");
engine.eval("BPE_DETAIL_CLEAN := BPE_DETAIL_VTL[drop LAMBERT_X, LAMBERT_Y]\n" +
"[rename ID_EQUIPEMENT to id, TYPEQU to facility_type, DEPCOM to municipality, REF_YEAR to year];");

Dataset bpeDetailClean = (Dataset) engine.getContext().getAttribute("BPE_DETAIL_CLEAN");
Structured.DataStructure bpeDetailCleanStructure = bpeDetailClean.getDataStructure();
Expand All @@ -90,8 +89,8 @@ public void bpeV1() throws ScriptException {
assertThat(bpeDetailCleanStructure.get("year").getRole()).isEqualTo(Dataset.Role.ATTRIBUTE);

// Step 3
engine.eval("BPE_MUNICIPALITY <- BPE_DETAIL_CLEAN" +
" [aggr nb := count(id) group by municipality, year, facility_type];");
engine.eval("BPE_MUNICIPALITY <- BPE_DETAIL_CLEAN[aggr nb := count(id) group by municipality, year, facility_type]" +
"[rename year to TIME_PERIOD];");

Dataset bpeMunicipality = (Dataset) engine.getContext().getAttribute("BPE_MUNICIPALITY");
Structured.DataStructure bpeMunicipalityStructure = bpeMunicipality.getDataStructure();
Expand All @@ -102,17 +101,16 @@ public void bpeV1() throws ScriptException {
assertThat(bpeMunicipalityStructure.get("facility_type").getType()).isEqualTo(String.class);
assertThat(bpeMunicipalityStructure.get("facility_type").getRole()).isEqualTo(Dataset.Role.IDENTIFIER);

assertThat(bpeMunicipalityStructure.get("year").getType()).isEqualTo(String.class);
assertThat(bpeMunicipalityStructure.get("year").getRole()).isEqualTo(Dataset.Role.IDENTIFIER);
assertThat(bpeMunicipalityStructure.get("TIME_PERIOD").getType()).isEqualTo(String.class);
assertThat(bpeMunicipalityStructure.get("TIME_PERIOD").getRole()).isEqualTo(Dataset.Role.IDENTIFIER);


assertThat(bpeMunicipalityStructure.get("nb").getType()).isEqualTo(Long.class);
assertThat(bpeMunicipalityStructure.get("nb").getRole()).isEqualTo(Dataset.Role.MEASURE);

// Step 4
engine.eval("BPE_NUTS3 <- BPE_MUNICIPALITY" +
" [calc nuts3 := if substr(municipality,1,2) = \"97\" then substr(municipality,1,3) else substr(municipality,1,2)] \n" +
" [aggr nb := count(nb) group by year, nuts3, facility_type];");
engine.eval("BPE_NUTS3 <- BPE_MUNICIPALITY[calc nuts3 := if substr(municipality,1,2) = \"97\" then substr(municipality,1,3) else substr(municipality,1,2)]\n" +
"[aggr nb := count(nb) group by TIME_PERIOD, nuts3, facility_type];");

Dataset bpeNuts = (Dataset) engine.getContext().getAttribute("BPE_NUTS3");
Structured.DataStructure bpeNutsStructure = bpeNuts.getDataStructure();
Expand All @@ -123,8 +121,8 @@ public void bpeV1() throws ScriptException {
assertThat(bpeNutsStructure.get("facility_type").getType()).isEqualTo(String.class);
assertThat(bpeNutsStructure.get("facility_type").getRole()).isEqualTo(Dataset.Role.IDENTIFIER);

assertThat(bpeNutsStructure.get("year").getType()).isEqualTo(String.class);
assertThat(bpeNutsStructure.get("year").getRole()).isEqualTo(Dataset.Role.IDENTIFIER);
assertThat(bpeNutsStructure.get("TIME_PERIOD").getType()).isEqualTo(String.class);
assertThat(bpeNutsStructure.get("TIME_PERIOD").getRole()).isEqualTo(Dataset.Role.IDENTIFIER);


assertThat(bpeNutsStructure.get("nb").getType()).isEqualTo(Long.class);
Expand All @@ -146,7 +144,7 @@ public void bpeV1() throws ScriptException {
// Step 6
Structured.DataStructure censusStructure = TrevasSDMXUtils.buildStructureFromSDMX3("src/test/resources/DSD_BPE_CENSUS.xml", "LEGAL_POP");

SparkDataset censusNuts = new SparkDataset(
SparkDataset legalPop = new SparkDataset(
spark.read()
.option("header", "true")
.option("delimiter", ";")
Expand All @@ -155,13 +153,12 @@ public void bpeV1() throws ScriptException {
censusStructure
);

context.setAttribute("CENSUS_NUTS3_2021", censusNuts, ScriptContext.ENGINE_SCOPE);
context.setAttribute("LEGAL_POP", legalPop, ScriptContext.ENGINE_SCOPE);

engine.eval("CENSUS_NUTS3_2021 := CENSUS_NUTS3_2021 \n" +
" [rename REF_AREA to nuts3, TIME_PERIOD to year, POP_TOT to pop]\n" +
" [filter year = \"2021\"]\n" +
" [calc pop := cast(pop, integer)]" +
" [drop year, NB_COM, POP_MUNI];");
engine.eval("CENSUS_NUTS3_2021 := LEGAL_POP [rename REF_AREA to nuts3, POP_TOT to pop]\n" +
"[filter TIME_PERIOD = \"2021\"]\n" +
"[calc pop := cast(pop, integer)]\n" +
"[drop TIME_PERIOD, NB_COM, POP_MUNI];");

Dataset censusNuts2021 = (Dataset) engine.getContext().getAttribute("CENSUS_NUTS3_2021");
Structured.DataStructure censusNuts2021Structure = censusNuts2021.getDataStructure();
Expand All @@ -173,9 +170,8 @@ public void bpeV1() throws ScriptException {
assertThat(censusNuts2021Structure.get("pop").getRole()).isEqualTo(Dataset.Role.MEASURE);

// Step 7
engine.eval("GENERAL_PRACT_NUTS3_2021 := BPE_NUTS3" +
" [filter facility_type = \"D201\" and year = \"2021\"]\n" +
" [drop facility_type, year];");
engine.eval("GENERAL_PRACT_NUTS3_2021 := BPE_NUTS3[filter facility_type = \"D201\" and TIME_PERIOD = \"2021\"]\n" +
"[drop facility_type, TIME_PERIOD];");

Dataset generalNuts = (Dataset) engine.getContext().getAttribute("GENERAL_PRACT_NUTS3_2021");
Structured.DataStructure generalNutsStructure = generalNuts.getDataStructure();
Expand All @@ -188,8 +184,8 @@ public void bpeV1() throws ScriptException {

// Step 8
engine.eval("BPE_CENSUS_NUTS3_2021 <- inner_join(GENERAL_PRACT_NUTS3_2021, CENSUS_NUTS3_2021)\n" +
" [calc pract_per_10000_inhabitants := nb / pop * 10000]\n" +
" [drop nb, pop];");
"[calc pract_per_10000_inhabitants := nb / pop * 10000]\n" +
"[drop nb, pop];");

Dataset bpeCensus = (Dataset) engine.getContext().getAttribute("BPE_CENSUS_NUTS3_2021");
Structured.DataStructure bpeCensusStructure = bpeCensus.getDataStructure();
Expand Down
38 changes: 17 additions & 21 deletions vtl-sdmx/src/test/java/fr/insee/vtl/SDMXVTLWorkflowTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,15 @@ public class SDMXVTLWorkflowTest {

@BeforeEach
public void setUp() {
SparkSession.builder()
.appName("test")
.master("local")
.getOrCreate();

ScriptEngineManager mgr = new ScriptEngineManager();
engine = mgr.getEngineByExtension("vtl");

engine.put(VtlScriptEngine.PROCESSING_ENGINE_NAMES, "spark");
}

@Disabled
Expand All @@ -54,16 +60,6 @@ void testRefFromRepo() {

@Test
void testGetEmptyDataset() {

SparkSession.builder()
.appName("test")
.master("local")
.getOrCreate();

ScriptEngineManager mgr = new ScriptEngineManager();
ScriptEngine engine = mgr.getEngineByExtension("vtl");
engine.put(VtlScriptEngine.PROCESSING_ENGINE_NAMES, "spark");

ReadableDataLocation rdl = new ReadableDataLocationTmp("src/test/resources/DSD_BPE_CENSUS.xml");
SDMXVTLWorkflow sdmxVtlWorkflow = new SDMXVTLWorkflow(engine, rdl, Java8Helpers.mapOf());
Map<String, Dataset> emptyDatasets = sdmxVtlWorkflow.getEmptyDatasets();
Expand All @@ -84,13 +80,13 @@ void testGetEmptyDataset() {
new Structured.DataStructure(Java8Helpers.listOf(
new Structured.Component("facility_type", String.class, Dataset.Role.IDENTIFIER),
new Structured.Component("municipality", String.class, Dataset.Role.IDENTIFIER),
new Structured.Component("year", String.class, Dataset.Role.IDENTIFIER),
new Structured.Component("TIME_PERIOD", String.class, Dataset.Role.IDENTIFIER),
new Structured.Component("nb", Long.class, Dataset.Role.MEASURE)
))
);
assertThat(result.get("BPE_NUTS3").getDataStructure()).isEqualTo(
new Structured.DataStructure(Java8Helpers.listOf(
new Structured.Component("year", String.class, Dataset.Role.IDENTIFIER),
new Structured.Component("TIME_PERIOD", String.class, Dataset.Role.IDENTIFIER),
new Structured.Component("facility_type", String.class, Dataset.Role.IDENTIFIER),
new Structured.Component("nuts3", String.class, Dataset.Role.IDENTIFIER),
new Structured.Component("nb", Long.class, Dataset.Role.MEASURE)
Expand All @@ -103,8 +99,8 @@ public void testGetRulesetsVTL() {
ReadableDataLocation rdl = new ReadableDataLocationTmp("src/test/resources/DSD_BPE_CENSUS.xml");
SDMXVTLWorkflow sdmxVtlWorkflow = new SDMXVTLWorkflow(engine, rdl, Java8Helpers.mapOf());
assertThat(sdmxVtlWorkflow.getRulesetsVTL()).isEqualTo(
"define datapoint ruleset UNIQUE_MUNICIPALITY (valuedomain CL_DEPCOM) is\n" +
" MUNICIPALITY_FORMAT_RULE : match_characters(CL_DEPCOM, \"[0-9]{5}|2[A-B][0-9]{3}\") errorcode \"Municipality code is not in the correct format\"\n" +
"define datapoint ruleset UNIQUE_MUNICIPALITY (variable DEPCOM) is\n" +
" MUNICIPALITY_FORMAT_RULE : match_characters(DEPCOM, \"[0-9]{5}|2[A-B][0-9]{3}\") errorcode \"Municipality code is not in the correct format\"\n" +
" end datapoint ruleset;\n" +
"\n" +
"define datapoint ruleset NUTS3_TYPES (variable facility_type, nb) is\n" +
Expand All @@ -126,24 +122,24 @@ public void testGetTransformationsVTL() {
" [rename ID_EQUIPEMENT to id, TYPEQU to facility_type, DEPCOM to municipality, REF_YEAR to year];\n" +
"\n" +
"// BPE aggregation by municipality, type and year\n" +
"BPE_MUNICIPALITY <- BPE_DETAIL_CLEAN [aggr nb := count(id) group by municipality, year, facility_type];\n" +
"BPE_MUNICIPALITY <- BPE_DETAIL_CLEAN [aggr nb := count(id) group by municipality, year, facility_type] [rename year to TIME_PERIOD];\n" +
"\n" +
"// BPE aggregation by NUTS 3, type and year\n" +
"BPE_NUTS3 <- BPE_MUNICIPALITY [calc nuts3 := if substr(municipality,1,2) = \"97\" then substr(municipality,1,3) else substr(municipality,1,2)]\n" +
" [aggr nb := count(nb) group by year, nuts3, facility_type];\n" +
" [aggr nb := count(nb) group by TIME_PERIOD, nuts3, facility_type];\n" +
"\n" +
"// BPE validation of facility types by NUTS 3\n" +
"CHECK_NUTS3_TYPES := check_datapoint(BPE_NUTS3, NUTS3_TYPES invalid);\n" +
"\n" +
"// Prepare 2021 census dataset by NUTS 3\n" +
"CENSUS_NUTS3_2021 := LEGAL_POP [rename REF_AREA to nuts3, TIME_PERIOD to year, POP_TOT to pop]\n" +
" [filter year = \"2021\"]\n" +
"CENSUS_NUTS3_2021 := LEGAL_POP [rename REF_AREA to nuts3, POP_TOT to pop]\n" +
" [filter TIME_PERIOD = \"2021\"]\n" +
" [calc pop := cast(pop, integer)]\n" +
" [drop year, NB_COM, POP_MUNI];\n" +
" [drop TIME_PERIOD, NB_COM, POP_MUNI];\n" +
"\n" +
"// Extract dataset on general practitioners from BPE by NUTS 3 in 2021\n" +
"GENERAL_PRACT_NUTS3_2021 := BPE_NUTS3 [filter facility_type = \"D201\" and year = \"2021\"]\n" +
" [drop facility_type, year];\n" +
"GENERAL_PRACT_NUTS3_2021 := BPE_NUTS3 [filter facility_type = \"D201\" and TIME_PERIOD = \"2021\"]\n" +
" [drop facility_type, TIME_PERIOD];\n" +
"\n" +
"// Merge practitioners and legal population datasets by NUTS 3 in 2021 and compute an indicator\n" +
"BPE_CENSUS_NUTS3_2021 <- inner_join(GENERAL_PRACT_NUTS3_2021, CENSUS_NUTS3_2021)\n" +
Expand Down
Loading
Loading