Skip to content

Commit

Permalink
Exclude duplicate columns
Browse files Browse the repository at this point in the history
  • Loading branch information
jterry64 committed Mar 7, 2024
1 parent 9593926 commit 7554ee8
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 5 deletions.
2 changes: 1 addition & 1 deletion src/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ RUN pip install . -t python
# to change the hash of the file and get TF to realize it needs to be
# redeployed. Ticket for a better solution:
# https://gfw.atlassian.net/browse/GTC-1250
# change 21342344
# change 5

RUN yum install -y zip geos-devel

Expand Down
33 changes: 30 additions & 3 deletions src/datapump/jobs/geotrellis.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,11 @@ def upload(self):
),
cluster=(table.cluster.dict() if table.cluster else table.cluster),
table_schema=table.table_schema,
constraints=table.constraints,
constraints=(
[constraint.dict() for constraint in table.constraints]
if table.constraints
else table.constraints
),
partitions=(
table.partitions.dict()
if table.partitions
Expand Down Expand Up @@ -623,11 +627,34 @@ def _get_constraints(table_schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]
Return uniqueness constraint based off table schema. Anything non-numeric
should be used, since these are fields used for filtering and grouping.
"""
columns_names = [field["name"] for field in table_schema if field["data_type"] != "numeric"]
columns_names = {field["name"] for field in table_schema if field["data_type"] != "numeric"}

# temporarily remove duplicate columns from constraints until we delete them from
# geotrellis, since these put us over the constraint columns limit
duplicate_columns = {
"umd_tree_cover_density__threshold",
"tsc_tree_cover_loss_drivers__type",
"is__birdlife_alliance_for_zero_extinction_site",
"gfw_plantation__type",
"is__gmw_mangroves_1996",
"is__gmw_mangroves_2020",
"is__gfw_tiger_landscape",
"is__landmark_land_right",
"is__gfw_land_right",
"is__birdlife_key_biodiversity_area",
"is__gfw_mining",
"is__peatland",
"is__gfw_resource_right",
"is__gfw_managed_forest",
"is__umd_tree_cover_gain_2000-2012",
"wdpa_protected_area__iucn_cat"
}

constraint_columns = list(columns_names - duplicate_columns)
return [
{
"constraint_type": "unique",
"column_names": columns_names
"column_names": constraint_columns
}
]

Expand Down
2 changes: 1 addition & 1 deletion src/datapump/jobs/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,6 @@ class AnalysisResultTable(BaseModel):
cluster: Optional[Index] = None
partitions: Optional[Partitions] = None
table_schema: List[Dict[str, Any]] = []
constraints: Optional[List[Constraint]]
constraints: Optional[List[Constraint]] = []
latitude_field: str = ""
longitude_field: str = ""

0 comments on commit 7554ee8

Please sign in to comment.