
Data processing utils

Utilities to process GeoJSON and GeoPackage data before adding it to the database.

apply_actions(df, actions)

Apply a sequence of actions to a Geometry.

Parameters:

Name     Type          Description                             Default
-------  ------------  --------------------------------------  --------
df       GeoDataFrame  GeoDataFrame to apply actions to.       required
actions  dict          Actions to apply to the GeoDataFrame.   required

Returns:

Name  Type          Description
----  ------------  ----------------------------------
df    GeoDataFrame  GeoDataFrame with actions applied.

Source code in back/iarbre_data/utils/data_processing.py
def apply_actions(df: gpd.GeoDataFrame, actions: dict) -> gpd.GeoDataFrame:
    """
    Apply a sequence of actions to a Geometry.

    Args:
        df (GeoDataFrame): GeoDataFrame to apply actions to.
        actions (dict): Actions to apply to the GeoDataFrame.

    Returns:
        df (GeoDataFrame): GeoDataFrame with actions applied.
    """
    if actions.get("filter"):
        df = df[df[actions["filter"]["name"]] == actions["filter"]["value"]]
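    # "filters" combines several equality conditions with a logical OR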
    if actions.get("filters"):
        df = df[
            reduce(
                lambda x, y: x | y,
                [
                    df[filter_["name"]] == filter_["value"]
                    for filter_ in actions["filters"]
                ],
            )
        ]
    if actions.get("exclude"):
        if isinstance(actions["exclude"]["value"], list):
            df = df[~df[actions["exclude"]["name"]].isin(actions["exclude"]["value"])]
        else:
            df = df[df[actions["exclude"]["name"]] != actions["exclude"]["value"]]
    if actions.get("explode"):
        df = df.explode(index_parts=False)
    if actions.get("buffer_size"):
        df = df.buffer(actions["buffer_size"])
    if actions.get("buffer"):
        buffer_distances = df[actions["buffer"]["distance_column"]]
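        # Distance columns expressed in centimetres are converted to metres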
        if "_cm" in actions["buffer"]["distance_column"]:
            buffer_distances /= 100
        df = df.buffer(buffer_distances)
    if actions.get("simplify"):
        df = df.simplify(actions["simplify"])
    if actions.get("union"):
        if isinstance(df, gpd.GeoDataFrame):
            df = df["geometry"]
        geometry = unary_union(df)
        df = gpd.GeoDataFrame({"geometry": [geometry]}, crs=TARGET_PROJ)

    # Explode multi-part geometries and keep only Polygons
    df = df.explode(index_parts=False)
    df = df[df.geometry.type == "Polygon"]
    return df
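
A minimal usage sketch, not taken from the repository: the "nature" column, its values and the metric CRS are invented for illustration, and the import path is assumed from the source location shown above.

import geopandas as gpd
from shapely.geometry import Polygon

from iarbre_data.utils.data_processing import apply_actions  # assumed import path

# Illustrative data: three features with a made-up "nature" attribute.
df = gpd.GeoDataFrame(
    {
        "nature": ["parc", "friche", "parking"],
        "geometry": [
            Polygon([(0, 0), (0, 10), (10, 10), (10, 0)]),
            Polygon([(20, 0), (20, 5), (25, 5), (25, 0)]),
            Polygon([(40, 0), (40, 5), (45, 5), (45, 0)]),
        ],
    },
    crs="EPSG:2154",  # assumed metric CRS
)

# Keep two attribute values (combined with OR), simplify each geometry,
# then merge the selection with a unary union.
actions = {
    "filters": [
        {"name": "nature", "value": "parc"},
        {"name": "nature", "value": "friche"},
    ],
    "simplify": 1,
    "union": True,
}

result = apply_actions(df, actions)
print(len(result), result.geometry.geom_type.unique())

The checks run in the fixed order shown in the source (filter, filters, exclude, explode, buffer_size, buffer, simplify, union), so the key order in the actions dict does not matter.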

batched(iterable, n)

Batch data into tuples of length n. The last batch may be shorter.

Source code in back/iarbre_data/utils/data_processing.py
def batched(iterable, n):
    """Batch data into tuples of length n. The last batch may be shorter."""
    # batched('ABCDEFG', 3) --> ABC DEF G
    if n < 1:
        raise ValueError("n must be at least one")
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch
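
A quick illustration of the batching behaviour (import path assumed from the source location above):

from iarbre_data.utils.data_processing import batched  # assumed import path

print(list(batched("ABCDEFG", 3)))
# [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]

Since Python 3.12 the standard library offers an equivalent itertools.batched.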

make_valid(geometry)

Fix minor topology errors in a geometry, such as a Polygon not being closed.

Parameters:

Name      Type                                Description                    Default
--------  ----------------------------------  -----------------------------  --------
geometry  shapely.geometry.base.BaseGeometry  The geometry to be validated.  required

Returns:

Type                                Description
----------------------------------  -----------------------
shapely.geometry.base.BaseGeometry  The validated geometry.

Source code in back/iarbre_data/utils/data_processing.py
def make_valid(
    geometry: shapely.geometry.base.BaseGeometry,
) -> shapely.geometry.base.BaseGeometry:
    """
    Fix minor topology errors in a geometry, such as a Polygon not being closed.
    Args:
        geometry (shapely.geometry.base.BaseGeometry): The geometry to be validated.
    Returns:
        shapely.geometry.base.BaseGeometry: The validated geometry.
    """
    if geometry and not geometry.is_valid:
        return geometry.buffer(0)
    return geometry
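
A short sketch using a self-intersecting "bowtie" polygon, a classic invalid geometry (import path assumed as above):

from shapely.geometry import Polygon

from iarbre_data.utils.data_processing import make_valid  # assumed import path

bowtie = Polygon([(0, 0), (2, 2), (2, 0), (0, 2)])
print(bowtie.is_valid)  # False: the ring crosses itself at (1, 1)
fixed = make_valid(bowtie)
print(fixed.is_valid)  # True: repaired with a zero-width buffer

For heavier invalidities, shapely's own shapely.validation.make_valid is an alternative to the zero-width buffer trick.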

process_data(df, data_config)

Process geometries.

Parameters:

Name         Type          Description                        Default
-----------  ------------  ---------------------------------  --------
df           GeoDataFrame  GeoDataFrame to apply actions on.  required
data_config  dict          Configuration of the data.         required

Returns:

Name   Type  Description
-----  ----  ---------------
datas  list  Processed data.

Source code in back/iarbre_data/utils/data_processing.py
def process_data(df: gpd.GeoDataFrame, data_config: dict) -> list:
    """
    Process geometries.

    Args:
        df (GeoDataFrame): GeoDataFrame to apply actions on.
        data_config (dict): Configuration of the data.

    Returns:
        datas (list): Processed data.
    """
    datas = []
    actions_factors = zip(
        data_config.get("actions", [{}]), data_config["factors"]
    )  # Defaults to a single empty action set
    for actions, factor in actions_factors:
        log_progress(f"Start actions: {actions}")
        sub_df = apply_actions(df.copy(), actions)
        if len(sub_df) == 0:
            print(f"Factor: {factor} only contained Points")
            continue
        datas += [
            {"geometry": geometry, "factor": factor} for geometry in sub_df.geometry
        ]
    return datas
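
A sketch of the data_config shape this function expects, inferred from how it reads the dict ("factors" is zipped with "actions"); the dataset name, column and values are invented, and df is assumed to be a GeoDataFrame like the one built in the apply_actions sketch above.

from iarbre_data.utils.data_processing import process_data  # assumed import path

data_config = {
    "name": "espaces_verts",  # hypothetical dataset name
    "factors": ["Parc", "Friche"],
    "actions": [
        {"filter": {"name": "nature", "value": "parc"}, "simplify": 1},
        {"filter": {"name": "nature", "value": "friche"}},
    ],
}

datas = process_data(df, data_config)
# datas == [{"geometry": <Polygon>, "factor": "Parc"},
#           {"geometry": <Polygon>, "factor": "Friche"}]

Each geometry/factor pair is a plain dict, ready to be passed to save_geometries below.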

save_geometries(datas, data_config)

Save geometries to the database.

Parameters:

Name         Type        Description                                                                        Default
-----------  ----------  ---------------------------------------------------------------------------------  --------
datas        list[dict]  List of dictionaries containing geometries and metadata to save to the database.  required
data_config  dict        Configuration of the data.                                                         required

Returns:

Type  Description
----  -----------
None  None

Source code in back/iarbre_data/utils/data_processing.py
def save_geometries(datas: list[dict], data_config: dict) -> None:
    """
    Save geometries to the database.

    Args:
        datas (list[dict]): List of dictionaries containing geometries and metadata to save to the database.
        data_config (dict): Configuration of the data.

    Returns:
        None
    """
    for ix, batch in enumerate(tqdm(batched(datas, 1000))):
        Data.objects.bulk_create(
            [
                Data(
                    **{
                        **data,
                        "geometry": GEOSGeometry(data["geometry"].wkt),
                        "metadata": data_config["name"],
                    }
                )
                for data in batch
            ]
        )
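
An end-to-end sketch, assuming a configured Django environment with the project's Data model migrated, and reusing the df and data_config from the sketches above.

from iarbre_data.utils.data_processing import process_data, save_geometries  # assumed import path

datas = process_data(df, data_config)
save_geometries(datas, data_config)
# Rows are inserted with bulk_create in batches of 1000; each Data row gets the
# shapely geometry converted to GEOSGeometry and "metadata" set to data_config["name"].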