Skip to content

Reference

Here you will find the reference for all available functions in wiutils. Each function has its signature, a short explanation of its purpose and a description of both its arguments and return values.

Note

You will see that functions are divided into submodules (e.g. reading or filtering). For convenience, you can execute all the functions without accessing their submodules. For example, instead of executing wiutils.filtering.remove_duplicates() you can just execute wiutils.remove_duplicates().

wiutils.darwincore

Functions to create different core and extension tables following the Darwin Core (DwC) standard from Wildlife Insights data.

create_dwc_archive(cameras, deployments, images, projects, remove_duplicate_kws=None)

Creates a Darwin Core Archive consisting of four different cores and extensions: Event, Occurrence, Measurement or Facts and Simple Multimedia.

Parameters:

Name Type Description Default
cameras DataFrame

Dataframe with the bundle's cameras.

required
deployments DataFrame

Dataframe with the bundle's deployments.

required
images DataFrame

Dataframe with the bundle's images.

required
projects DataFrame

Dataframe with the bundle's projects.

required
remove_duplicate_kws dict

Keyword arguments passed to the wiutils.remove_duplicates function. Used for the creation of the Occurrence Core.

None

Returns:

Type Description
DataFrame

Darwin Core Event dataframe.

DataFrame

Darwin Core Occurrence dataframe.

DataFrame

Darwin Core Measurement or Facts dataframe.

DataFrame

Darwin Core Simple Multimedia dataframe.

Source code in wiutils/darwincore.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def create_dwc_archive(
    cameras: pd.DataFrame,
    deployments: pd.DataFrame,
    images: pd.DataFrame,
    projects: pd.DataFrame,
    remove_duplicate_kws: dict = None,
) -> tuple:
    """
    Creates a Darwin Core Archive consisting of four different cores and
    extensions: Event, Occurrence, Measurement or Facts and Simple
    Multimedia.

    Parameters
    ----------
    cameras : DataFrame
        Dataframe with the bundle's cameras.
    deployments : DataFrame
        Dataframe with the bundle's deployments.
    images : DataFrame
        Dataframe with the bundle's images.
    projects : DataFrame
        Dataframe with the bundle's projects.
    remove_duplicate_kws : dict
        Keyword arguments passed to the wiutils.remove_duplicates
        function. Used for the creation of the Occurrence Core.

    Returns
    -------
    DataFrame
        Darwin Core Event dataframe.
    DataFrame
        Darwin Core Occurrence dataframe.
    DataFrame
        Darwin Core Measurement or Facts dataframe.
    DataFrame
        Darwin Core Simple Multimedia dataframe.

    """
    # Every table is passed by keyword: the builders do not share a common
    # positional order (create_dwc_measurement takes deployments first,
    # then cameras), so positional calls are easy to get wrong.
    event = create_dwc_event(deployments=deployments, projects=projects)
    occurrence = create_dwc_occurrence(
        images=images,
        deployments=deployments,
        projects=projects,
        remove_duplicate_kws=remove_duplicate_kws,
    )
    measurement = create_dwc_measurement(deployments=deployments, cameras=cameras)
    multimedia = create_dwc_multimedia(images=images, deployments=deployments)

    return event, occurrence, measurement, multimedia

create_dwc_event(deployments, projects)

Creates a Darwin Core Event dataframe from deployments and projects information. See https://rs.gbif.org/core/dwc_event_2022-02-02.xml for more information about this core.

Parameters:

Name Type Description Default
deployments DataFrame

Dataframe with the bundle's deployments.

required
projects DataFrame

Dataframe with the bundle's projects.

required

Returns:

Type Description
DataFrame

Darwin Core Event dataframe.

Source code in wiutils/darwincore.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def create_dwc_event(
    deployments: pd.DataFrame,
    projects: pd.DataFrame,
) -> pd.DataFrame:
    """
    Creates a Darwin Core Event dataframe from deployments and projects
    information. See https://rs.gbif.org/core/dwc_event_2022-02-02.xml
    for more information about this core.

    Parameters
    ----------
    deployments : DataFrame
        Dataframe with the bundle's deployments.
    projects : DataFrame
        Dataframe with the bundle's projects.

    Returns
    -------
    DataFrame
        Darwin Core Event dataframe.

    """
    df = pd.merge(deployments, projects, on=_labels.deployments.project_id, how="left")
    df[_labels.deployments.start] = pd.to_datetime(df[_labels.deployments.start])
    df[_labels.deployments.end] = pd.to_datetime(df[_labels.deployments.end])

    core = df.rename(columns=_dwc.event.mapping)
    # Explicit copy so the column assignments below operate on an
    # independent frame and do not trigger chained-assignment warnings.
    core = core[core.columns[core.columns.isin(_dwc.event.order)]].copy()

    for term, value in _dwc.event.constants.items():
        core[term] = value

    delta = df[_labels.deployments.end] - df[_labels.deployments.start]
    core["samplingEffort"] = delta.dt.days.astype(str) + " trap-nights"

    core["eventDate"] = (
        df[_labels.deployments.start].dt.strftime("%Y-%m-%d")
        + "/"
        + df[_labels.deployments.end].dt.strftime("%Y-%m-%d")
    )

    # Read the bundled lookup table with an explicit encoding so the result
    # does not depend on the platform's default locale; the file is only
    # kept open for the duration of the read.
    countries_path = pathlib.Path(__file__).parent.joinpath("_dwc/countries.json")
    with open(countries_path, encoding="utf-8") as f:
        countries = pd.DataFrame(json.load(f))

    # Translate the values of countryCode using the table's "alpha-3" ->
    # "alpha-2" columns.
    core["countryCode"] = core["countryCode"].map(
        countries.set_index("alpha-3")["alpha-2"]
    )

    core = core.reindex(columns=_dwc.event.order)

    return core

create_dwc_measurement(deployments, cameras)

Creates a Darwin Core Measurement or Facts dataframe from cameras and deployments information. See https://rs.gbif.org/extension/dwc/measurements_or_facts_2022-02-02.xml for more information about this extension.

Parameters:

Name Type Description Default
deployments DataFrame

Dataframe with the bundle's deployments.

required
cameras DataFrame

Dataframe with the bundle's cameras.

required

Returns:

Type Description
DataFrame

Darwin Core Measurement or Facts dataframe.

Source code in wiutils/darwincore.py
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def create_dwc_measurement(
    deployments: pd.DataFrame,
    cameras: pd.DataFrame,
) -> pd.DataFrame:
    """
    Creates a Darwin Core Measurement or Facts dataframe from cameras and
    deployments information. See https://rs.gbif.org/extension/dwc/measurements_or_facts_2022-02-02.xml
    for more information about this extension.

    Parameters
    ----------
    deployments : DataFrame
        Dataframe with the bundle's deployments.
    cameras : DataFrame
        Dataframe with the bundle's cameras.

    Returns
    -------
    DataFrame
        Darwin Core Measurement or Facts dataframe. It holds one row per
        deployment and measurement type; rows without a measurement value
        are dropped.

    """
    df = pd.merge(deployments, cameras, on=_labels.deployments.camera_id, how="left")

    # Build one frame per measurement type and concatenate them once at the
    # end, instead of growing a frame with repeated pd.concat calls.
    frames = []
    for item in _dwc.measurement.mapping:
        temp = pd.DataFrame()
        temp["eventID"] = df.loc[:, _labels.deployments.deployment_id]
        temp["measurementType"] = item["type"]
        temp["measurementValue"] = df.loc[:, item["value"]]
        temp["measurementUnit"] = item["unit"]
        if item["remarks"]:
            temp["measurementRemarks"] = df.loc[:, item["remarks"]]
        else:
            temp["measurementRemarks"] = np.nan
        frames.append(temp)

    # pd.concat rejects an empty list, so fall back to an empty frame when
    # the mapping defines no measurements.
    extension = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    extension = extension.dropna(subset=["measurementValue"]).reset_index(drop=True)

    return extension

create_dwc_multimedia(images, deployments)

Creates a Darwin Core Simple Multimedia dataframe from images and deployments information. See https://rs.gbif.org/extension/gbif/1.0/multimedia.xml for more information about this extension. The result includes information from all the bundle's images.

Parameters:

Name Type Description Default
images DataFrame

Dataframe with the bundle's images.

required
deployments DataFrame

Dataframe with the bundle's deployments.

required

Returns:

Type Description
DataFrame

Darwin Core Simple Multimedia dataframe.

Source code in wiutils/darwincore.py
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
def create_dwc_multimedia(
    images: pd.DataFrame, deployments: pd.DataFrame
) -> pd.DataFrame:
    """
    Creates a Darwin Core Simple Multimedia dataframe from images and
    deployments information. See https://rs.gbif.org/extension/gbif/1.0/multimedia.xml
    for more information about this extension. The result includes
    information from all the bundle's images.

    Parameters
    ----------
    images : DataFrame
        Dataframe with the bundle's images.
    deployments : DataFrame
        Dataframe with the bundle's deployments.

    Returns
    -------
    DataFrame
        Darwin Core Simple Multimedia dataframe.

    """
    # pd.merge returns a frame with a fresh 0..n-1 index, while the titles
    # computed below are indexed like `images`. Normalising the index first
    # keeps both aligned even when `images` was filtered beforehand and
    # carries a non-default index. reset_index returns a new frame, so the
    # caller's DataFrame is not modified.
    images = images.reset_index(drop=True)

    df = pd.merge(images, deployments, on=_labels.images.deployment_id, how="left")
    df[_labels.images.url] = _gs_to_https(df[_labels.images.url])

    extension = df.rename(columns=_dwc.multimedia.mapping)
    # Explicit copy so the column assignments below operate on an
    # independent frame and do not trigger chained-assignment warnings.
    extension = extension[
        extension.columns[extension.columns.isin(_dwc.multimedia.order)]
    ].copy()

    for term, value in _dwc.multimedia.constants.items():
        extension[term] = value

    # Images without any identified taxon get a generic title.
    extension["title"] = get_lowest_taxon(images, return_rank=False).fillna(
        "Blank or unidentified"
    )

    extension = extension.reindex(columns=_dwc.multimedia.order)

    return extension

create_dwc_occurrence(images, deployments, projects, remove_duplicate_kws=None)

Creates a Darwin Core Occurrence dataframe from images, deployments and projects information. See https://rs.gbif.org/core/dwc_occurrence_2022-02-02.xml for more information about this core. The result includes only wildlife records (i.e. unidentified and duplicate images are removed).

Parameters:

Name Type Description Default
images DataFrame

Dataframe with the bundle's images.

required
deployments DataFrame

Dataframe with the bundle's deployments.

required
projects DataFrame

Dataframe with the bundle's projects.

required
remove_duplicate_kws dict

Keyword arguments passed to the wiutils.remove_duplicates function.

None

Returns:

Type Description
DataFrame

Darwin Core Occurrence dataframe.

Source code in wiutils/darwincore.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
def create_dwc_occurrence(
    images: pd.DataFrame,
    deployments: pd.DataFrame,
    projects: pd.DataFrame,
    remove_duplicate_kws: dict = None,
) -> pd.DataFrame:
    """
    Creates a Darwin Core Occurrence dataframe from images, deployments
    and projects information. See https://rs.gbif.org/core/dwc_occurrence_2022-02-02.xml
    for more information about this core. The result includes only
    wildlife records (i.e. unidentified and duplicate images are removed).

    Parameters
    ----------
    images : DataFrame
        Dataframe with the bundle's images.
    deployments : DataFrame
        Dataframe with the bundle's deployments.
    projects : DataFrame
        Dataframe with the bundle's projects.
    remove_duplicate_kws : dict
        Keyword arguments passed to the wiutils.remove_duplicates
        function. The dictionary is not modified; `reset_index` is always
        overridden to False internally.

    Returns
    -------
    DataFrame
        Darwin Core Occurrence dataframe.

    """
    # Work on a private copy of the keyword arguments so the caller's
    # dictionary is never mutated. The original index must be kept by
    # remove_duplicates because it is used below to locate the retained
    # images inside the full images table.
    duplicate_kws = dict(remove_duplicate_kws or {})
    duplicate_kws["reset_index"] = False

    images = images.copy()
    images = remove_unidentified(images, rank="class", reset_index=True)
    filtered = remove_duplicates(images, **duplicate_kws)

    df = pd.merge(
        filtered,
        deployments.drop(columns=_labels.deployments.project_id),
        on=_labels.images.deployment_id,
        how="left",
    )
    df = pd.merge(df, projects, on=_labels.images.project_id, how="left")
    df[_labels.images.date] = pd.to_datetime(df[_labels.images.date])

    core = df.rename(columns=_dwc.occurrence.mapping)
    # Explicit copy so the column assignments below operate on an
    # independent frame and do not trigger chained-assignment warnings.
    core = core[core.columns[core.columns.isin(_dwc.occurrence.order)]].copy()

    for term, value in _dwc.occurrence.constants.items():
        core[term] = value

    core["eventDate"] = df[_labels.images.date].dt.strftime("%Y-%m-%d")
    core["eventTime"] = df[_labels.images.date].dt.strftime("%H:%M:%S")

    # Number every retained image and propagate that number forward to the
    # rows that follow it, so each removed image is grouped with the
    # closest retained image above it. The URLs of each group are then
    # joined into a single pipe-separated string.
    images.loc[filtered.index, "__seq"] = np.arange(len(filtered))
    images["__seq"] = images["__seq"].ffill()
    images[_labels.images.url] = _gs_to_https(images[_labels.images.url])
    core["associatedMedia"] = images.groupby("__seq").agg(
        {_labels.images.url: "|".join}
    )

    # Re-index positionally so the taxonomic columns line up with `core`,
    # whose index comes from the merges above.
    filtered = filtered.reset_index(drop=True)
    taxa, ranks = get_lowest_taxon(filtered, return_rank=True)
    epithets = filtered[_labels.images.species].str.split(" ", expand=True)
    core["scientificName"] = taxa
    core["taxonRank"] = ranks
    core["specificEpithet"] = epithets[0]
    core["infraspecificEpithet"] = epithets.get(1, np.nan)

    core = core.reindex(columns=_dwc.occurrence.order)

    return core

wiutils.extraction

Functions for extracting information from WI tables.

get_date_ranges(images=None, deployments=None, source='both', compute_delta=False, pivot=False)

Gets deployment date ranges using information from either images, deployments or both.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

None
deployments DataFrame

DataFrame with the project's deployments.

None
source str

Source to get date ranges from. Values can be:

  • 'images' to get date ranges from images (i.e. first image to last image taken).
  • 'deployments' to get date ranges from deployments information (i.e. start date and end date).
  • 'both' to get date ranges from both sources.
'both'
compute_delta bool

Whether to compute the delta (in days) between the start and end dates.

False
pivot bool

Whether to pivot (reshape from long to wide format) the resulting DataFrame.

False

Returns:

Type Description
DataFrame

DataFrame with date ranges.

Source code in wiutils/extraction.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def get_date_ranges(
    images: pd.DataFrame = None,
    deployments: pd.DataFrame = None,
    source: str = "both",
    compute_delta: bool = False,
    pivot: bool = False,
) -> pd.DataFrame:
    """
    Gets deployment date ranges using information from either images,
    deployments or both.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images. Required when source is
        'images' or 'both'.
    deployments : DataFrame
        DataFrame with the project's deployments. Required when source is
        'deployments' or 'both'.
    source : str
        Source to get date ranges from. Values can be:

            - 'images' to get date ranges from images (i.e. first image
            to last image taken).
            - 'deployments' to get date ranges from deployments
            information (i.e. start date and end date).
            - 'both' to get date ranges from both sources.
    compute_delta : bool
        Whether to compute the delta (in days) between the start and end
        dates.
    pivot : bool
        Whether to pivot (reshape from long to wide format) the resulting
        DataFrame.

    Returns
    -------
    DataFrame
        DataFrame with date ranges.

    Raises
    ------
    ValueError
        If source is not one of the accepted values or if the DataFrame
        required by the selected source was not provided.

    """
    # Validate the option up front so an invalid value fails before any
    # work is done.
    if source not in ("images", "deployments", "both"):
        raise ValueError("source must be one of ['images', 'deployments', 'both']")

    df = pd.DataFrame()

    if source in ("images", "both"):
        if images is None:
            raise ValueError("images DataFrame must be provided.")
        images = images.copy()
        images[_labels.images.date] = pd.to_datetime(images[_labels.images.date])
        # Drop the time of day so ranges are expressed in whole dates.
        images[_labels.images.date] = pd.to_datetime(
            images[_labels.images.date].dt.date
        )
        dates = images.groupby(_labels.images.deployment_id)[_labels.images.date].agg(
            start_date="min", end_date="max"
        )
        dates["source"] = "images"
        df = pd.concat([df, dates.reset_index()], ignore_index=True)

    if source in ("deployments", "both"):
        if deployments is None:
            raise ValueError("deployments DataFrame must be provided.")
        deployments = deployments.copy()
        deployments = deployments.sort_values(_labels.deployments.deployment_id)
        deployments[_labels.deployments.start] = pd.to_datetime(
            deployments[_labels.deployments.start]
        )
        deployments[_labels.deployments.end] = pd.to_datetime(
            deployments[_labels.deployments.end]
        )
        dates = deployments.loc[
            :,
            [
                _labels.deployments.deployment_id,
                _labels.deployments.start,
                _labels.deployments.end,
            ],
        ]
        dates["source"] = "deployments"
        df = pd.concat([df, dates], ignore_index=True)

    if compute_delta:
        delta = df["end_date"] - df["start_date"]
        df["delta"] = delta.dt.days

    if pivot:
        df = df.pivot(index="deployment_id", columns="source")

    return df

get_lowest_taxon(images, return_rank=False)

Gets the lowest identified taxa and ranks.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
return_rank bool

Whether to return the lowest identified ranks.

False

Returns:

Type Description
Series

Lowest identified taxon for each image.

Series

Lowest identified rank for each image.

Source code in wiutils/extraction.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def get_lowest_taxon(
    images: pd.DataFrame, return_rank: bool = False
) -> Union[pd.Series, tuple]:
    """
    Gets, for every image, the lowest identified taxon and (optionally)
    the rank it was identified at.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    return_rank : bool
        Whether to also return the lowest identified ranks.

    Returns
    -------
    Series
        Lowest identified taxon for each image.
    Series
        Lowest identified rank for each image. Only returned when
        return_rank is True.

    """
    lowest_ranks = _utils.taxonomy.compute_taxonomic_rank(images)
    lowest_taxa = get_scientific_name(images, keep_genus=False, add_qualifier=False)

    # Images without a scientific name but with some identified rank take
    # their taxon from the column named after that rank.
    needs_fill = (lowest_taxa.isna()) & (lowest_ranks.notna())

    # Locate each rank name among the (sorted) column labels and translate
    # the hit back to a position in the original column order.
    column_order = np.argsort(images.columns)
    sorted_hits = np.searchsorted(
        images.columns[column_order], lowest_ranks.loc[needs_fill]
    )
    column_positions = column_order[sorted_hits]

    # Pick one cell per selected row: row i, column column_positions[i].
    row_positions = np.arange(needs_fill.sum())
    lowest_taxa.loc[needs_fill] = images.loc[needs_fill].values[
        row_positions, column_positions
    ]

    if not return_rank:
        return lowest_taxa
    return lowest_taxa, lowest_ranks

get_scientific_name(images, keep_genus=False, add_qualifier=False)

Gets the scientific name of each image by concatenating their respective genus and specific epithet.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
keep_genus bool

Whether to keep the genus as the scientific name in images where only the genus was identified. If False, the scientific name for those cases will be empty.

False
add_qualifier bool

Whether to add an open nomenclature qualifier (sp.) to the scientific name of those cases where only the genus was identified. Only has effect if keep_genus is True.

False

Returns:

Type Description
Series

Series with the corresponding scientific names.

Source code in wiutils/extraction.py
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def get_scientific_name(
    images: pd.DataFrame,
    keep_genus: bool = False,
    add_qualifier: bool = False,
) -> pd.Series:
    """
    Builds the scientific name of each image by joining its genus and
    specific epithet with a single space.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    keep_genus : bool
        Whether to keep the genus as the scientific name in images where
        only the genus was identified. If False, the scientific name for
        those cases will be empty.
    add_qualifier : bool
        Whether to add an open nomenclature qualifier (sp.) to the
        scientific name of those cases where only the genus was
        identified. Only has effect if keep_genus is True.

    Returns
    -------
    Series
        Series with the corresponding scientific names, indexed like
        images. Images without a name hold NaN.

    """
    # Placeholder values that do not count as an identification.
    placeholders = ["No CV Result", "Unknown"]

    genus = images[_labels.images.genus]
    epithet = images[_labels.images.species]
    genus_known = genus.notna() & ~genus.isin(placeholders)
    epithet_known = epithet.notna() & ~epithet.isin(placeholders)

    names = pd.Series(np.nan, index=images.index, dtype=str)

    full_name = genus_known & epithet_known
    names.loc[full_name] = genus.loc[full_name] + " " + epithet.loc[full_name]

    if keep_genus:
        genus_only = genus_known & ~epithet_known
        names.loc[genus_only] = genus.loc[genus_only]
        if add_qualifier:
            names.loc[genus_only] += " sp."

    return names

wiutils.filtering

Functions to filter WI images based on different conditions.

remove_domestic(images, broad=False, reset_index=False)

Removes images where the identification corresponds to a domestic species. See wiutils/_domestic.py for a list of the species considered as domestic.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
broad bool

Whether to use a broader strategy when removing domestic species. A broader strategy takes the genera from the list of domestic species and removes the images where the genus identification is in that list. Otherwise, the scientific name for each image is extracted and the images where the scientific name is in the list of domestic species are removed.

False
reset_index bool

Whether to reset the index of the resulting DataFrame. If True, the index will be numeric from 0 to the length of the result.

False

Returns:

Type Description
DataFrame

Copy of images with removed domestic species.

Source code in wiutils/filtering.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def remove_domestic(
    images: pd.DataFrame, broad: bool = False, reset_index: bool = False
) -> pd.DataFrame:
    """
    Drops the images identified as a domestic species. See
    wiutils/_domestic.py for a list of the species considered as
    domestic.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    broad : bool
        Whether to use a broader strategy when removing domestic species.
        If True, the genera of the domestic species are extracted and
        every image whose genus is among them is removed. If False, only
        images whose scientific name (genus and epithet) is in the list
        of domestic species are removed.
    reset_index : bool
        Whether to reset the index of the resulting DataFrame. If True,
        the index will be numeric from 0 to the length of the result.

    Returns
    -------
    DataFrame
        Copy of images with removed domestic species.

    """
    result = images.copy()

    if broad:
        # First word of every domestic species name, i.e. its genus.
        domestic_names = pd.Series(_domestic.species)
        domestic_genera = domestic_names.str.split(" ").str[0].drop_duplicates()
        is_domestic = result[_labels.images.genus].isin(domestic_genera)
    else:
        scientific_names = get_scientific_name(result, keep_genus=False)
        is_domestic = scientific_names.isin(_domestic.species)

    result = result[~is_domestic]

    if reset_index:
        result = result.reset_index(drop=True)

    return result

remove_duplicates(images, interval=30, unit='minutes', reset_index=False)

Removes duplicate records (images) from the same taxon in the same deployment given a time interval.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
interval int

Time interval (for a specific time unit).

30
unit str

Time unit. Possible values are:

  • 'weeks'
  • 'days'
  • 'hours'
  • 'minutes'
  • 'seconds'
'minutes'
reset_index bool

Whether to reset the index of the resulting DataFrame. If True, the index will be numeric from 0 to the length of the result.

False

Returns:

Type Description
DataFrame

Copy of images with removed duplicates.

Source code in wiutils/filtering.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def remove_duplicates(
    images: pd.DataFrame,
    interval: int = 30,
    unit: str = "minutes",
    reset_index: bool = False,
) -> pd.DataFrame:
    """
    Removes duplicate records (images) from the same taxon in the same
    deployment given a time interval. An image is kept when it is the
    first one of its deployment and taxon, or when the time elapsed since
    the previous image of that same deployment and taxon is at least the
    given interval. Images without an identified taxon are always kept.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    interval : int
        Time interval (for a specific time unit).
    unit : str
        Time unit. Possible values are:

            - 'weeks'
            - 'days'
            - 'hours'
            - 'minutes'
            - 'seconds'
    reset_index : bool
        Whether to reset the index of the resulting DataFrame. If True,
        the index will be numeric from 0 to the length of the result.

    Returns
    -------
    DataFrame
        Copy of images with removed duplicates.

    Raises
    ------
    ValueError
        If unit is not one of the accepted values.

    """
    if unit not in ("weeks", "days", "hours", "minutes", "seconds"):
        raise ValueError(
            "unit must be one of ['weeks', 'days', 'hours', 'minutes', 'seconds']"
        )

    # Temporary helper column holding the lowest identified taxon of each
    # image; it is dropped again before returning.
    images = images.copy()
    images["taxon"] = get_lowest_taxon(images, return_rank=False)

    # Second copy with parsed timestamps, used only to compute the mask.
    df = images.copy()
    df[_labels.images.date] = pd.to_datetime(df[_labels.images.date])

    # Sort so that, within every deployment and taxon, consecutive rows are
    # consecutive in time; diff() then gives the time since the previous
    # image of the same group (NaT for the first image of each group).
    df = df.sort_values([_labels.images.deployment_id, "taxon", _labels.images.date])
    delta = df.groupby([_labels.images.deployment_id, "taxon"])[
        _labels.images.date
    ].diff()
    # True for the rows to keep: far enough from the previous image, or no
    # previous image to compare against.
    mask = (delta >= pd.Timedelta(**{unit: interval})) | (delta.isna())

    # Apply the mask only to images that have a taxon; the mask is a Series
    # indexed like the images, so it selects rows by index label.
    images_reference = images.dropna(subset=["taxon"])
    images_reference = images_reference.sort_values(
        [_labels.images.deployment_id, "taxon", _labels.images.date]
    )
    df = images_reference.loc[mask]
    # Images without a taxon are appended back unfiltered.
    df = pd.concat([df, images[images["taxon"].isna()]])
    # Restore the row order of the input.
    df = df.reindex(images.index.intersection(df.index))

    if reset_index:
        df = df.reset_index(drop=True)

    df = df.drop(columns="taxon")

    return df

remove_inconsistent_dates(images, deployments, reset_index=False)

Removes images where the timestamp is outside the date range of the corresponding deployment.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
deployments pd.DataFrame

DataFrame with the project's deployments.

required
reset_index bool

Whether to reset the index of the resulting DataFrame. If True, the index will be numeric from 0 to the length of the result.

False

Returns:

Type Description
DataFrame

Images DataFrame with removed inconsistent images.

Source code in wiutils/filtering.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def remove_inconsistent_dates(
    images: pd.DataFrame, deployments: pd.DataFrame, reset_index: bool = False
) -> pd.DataFrame:
    """
    Removes images where the timestamp is outside the date range of the
    corresponding deployment.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    deployments : DataFrame
        DataFrame with the project's deployments. It is expected to hold
        a single row per deployment id, so that every image is matched
        with exactly one deployment.
    reset_index : bool
        Whether to reset the index of the resulting DataFrame. If True,
        the index will be numeric from 0 to the length of the result.

    Returns
    -------
    DataFrame
        Images DataFrame with removed inconsistent images.

    """
    df = images.copy()
    deployments = deployments.copy()

    df[_labels.images.date] = pd.to_datetime(df[_labels.images.date])
    deployments[_labels.deployments.start] = pd.to_datetime(
        deployments[_labels.deployments.start]
    )
    deployments[_labels.deployments.end] = pd.to_datetime(
        deployments[_labels.deployments.end]
    )

    # Drop the time of day from the image timestamps before comparing.
    df[_labels.images.date] = pd.to_datetime(df[_labels.images.date].dt.date)
    df = pd.merge(
        df,
        deployments[
            [
                _labels.deployments.deployment_id,
                _labels.deployments.start,
                _labels.deployments.end,
            ]
        ],
        on=_labels.images.deployment_id,
        how="left",
    )
    is_between = df[_labels.images.date].between(
        df[_labels.deployments.start], df[_labels.deployments.end]
    )

    # The merge result carries a fresh 0..n-1 index, which does not match
    # the index of `images` when the latter was filtered beforehand. A left
    # merge keeps the row order of `images`, so the mask is applied by
    # position instead of by index label.
    df = images[is_between.to_numpy()]

    if reset_index:
        df = df.reset_index(drop=True)

    return df

remove_unidentified(images, rank='genus', reset_index=False)

Removes unidentified (up to a specific taxonomic rank) images.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
rank str

Taxonomic rank for which images that do not have an identification will be removed. Possible values are:

  • 'species'
  • 'genus'
  • 'family'
  • 'order'
  • 'class'

For example, if rank is 'family', all images where the family (and therefore the inferior ranks - genus and epithet -) were not identified will be removed.

'genus'
reset_index bool

Whether to reset the index of the resulting DataFrame. If True, the index will be numeric from 0 to the length of the result.

False

Returns:

Type Description
DataFrame

Images DataFrame with removed unidentified images.

Source code in wiutils/filtering.py
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
def remove_unidentified(
    images: pd.DataFrame, rank: str = "genus", reset_index: bool = False
) -> pd.DataFrame:
    """
    Drops the images that lack an identification up to a given
    taxonomic rank.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images. It is not modified; the
        function works on a copy.
    rank : str
        Taxonomic rank used as the identification threshold. Possible
        values are:

            - 'species'
            - 'genus'
            - 'family'
            - 'order'
            - 'class'

        For example, if rank is 'family', every image where the family
        (and therefore the inferior ranks - genus and epithet -) was
        not identified is removed.
    reset_index : bool
        Whether to reset the index of the resulting DataFrame. If True,
        the index will be numeric from 0 to the length of the result.

    Returns
    -------
    DataFrame
        Copy of the images DataFrame without the unidentified images.

    """
    result = images.copy()
    columns = _utils.taxonomy.get_taxonomy_columns(rank)

    # Placeholder labels are treated exactly like missing values, so a
    # row is dropped only when every selected taxonomy column is empty.
    placeholders = ["No CV Result", "Unknown"]
    result[columns] = result[columns].replace(placeholders, np.nan)
    result = result.dropna(subset=columns, how="all")

    return result.reset_index(drop=True) if reset_index else result

wiutils.preprocessing

Functions to preprocess information before uploading it to WI.

change_image_timestamp(image_path, output_path, timestamp=None, offset=None)

Changes an image's associated timestamp metadata for a new timestamp. This can be a new arbitrary timestamp or a computed new timestamp from an offset and the original timestamp.

Parameters:

Name Type Description Default
image_path str or Path

Relative or absolute path of the image whose timestamp will be changed.

required
output_path str or Path

Relative or absolute path of the output image.

required
timestamp str, datetime.datetime or pd.Timestamp

New timestamp to write to the image's metadata.

None
offset DateOffset or Timedelta

Offset or Timedelta to add to (if positive) or subtract from (if negative) the original image's timestamp. This argument only has effect when no timestamp is specified.

None

Returns:

Type Description
None
Source code in wiutils/preprocessing.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def change_image_timestamp(
    image_path: Union[str, pathlib.Path],
    output_path: Union[str, pathlib.Path],
    timestamp: Union[str, datetime.datetime, pd.Timestamp] = None,
    offset: Union[pd.DateOffset, pd.Timedelta] = None,
) -> None:
    """
    Changes an image's associated timestamp metadata for a new timestamp.
    This can be a new arbitrary timestamp or a computed new timestamp from
    an offset and the original timestamp.

    Parameters
    ----------
    image_path : str or Path
        Relative or absolute path of the image whose timestamp will be
        changed.
    output_path : str or Path
        Relative or absolute path of the output image.
    timestamp : str, datetime.datetime or pd.Timestamp
        New timestamp to write to the image's metadata.
    offset : DateOffset or Timedelta
        Offset or Timedelta to add to (if positive) or subtract from (if
        negative) the original image's timestamp. This argument only has
        effect when no timestamp is specified.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If neither timestamp nor offset is specified.

    """
    # Fail early with a clear message instead of an opaque TypeError
    # raised later by adding None to a Timestamp.
    if timestamp is None and offset is None:
        raise ValueError("Either timestamp or offset must be specified.")

    image_path = pathlib.Path(image_path)
    output_path = pathlib.Path(output_path)

    # The context manager guarantees the source file handle is closed,
    # even if reading the metadata or saving the result fails.
    with Image.open(image_path.as_posix()) as image:
        exif = image.getexif()

        if timestamp is not None:
            new_timestamp = pd.Timestamp(timestamp)
        else:
            original = exif[_get_exif_code("DateTime")]
            # The stored value separates the date parts with colons; only
            # the first two are swapped for dashes so that pandas can
            # parse it while the time part keeps its own colons.
            new_timestamp = pd.Timestamp(original.replace(":", "-", 2)) + offset

        stamp = new_timestamp.strftime("%Y:%m:%d %H:%M:%S")
        exif[_get_exif_code("DateTime")] = stamp
        exif[_get_exif_code("DateTimeOriginal")] = stamp

        image.save(output_path.as_posix(), format=image.format, exif=exif)

convert_video_to_images(video_path, output_path, timestamp=None, image_format='jpeg', offset=None)

Converts a video to images with an associated timestamp.

Parameters:

Name Type Description Default
video_path str or Path

Relative or absolute path of the video to convert.

required
output_path str or Path

Relative or absolute path of the folder to save the images to. If the folder does not exist, it will be created.

required
timestamp str, datetime.datetime or pd.Timestamp

Timestamp of the beginning of the video. If no timestamp is provided, it will be automatically extracted from the metadata.

None
image_format str

Image format of the output images. Possible values are:

  • 'jpeg'
  • 'png'
'jpeg'
offset int

Offset (in seconds) to convert frames to images. For example, if offset is 1, the output images will correspond to 1 second-separated frames of the video. If offset is None, all the frames in the video will be converted to images.

None

Returns:

Type Description
None
Source code in wiutils/preprocessing.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def convert_video_to_images(
    video_path: Union[str, pathlib.Path],
    output_path: Union[str, pathlib.Path],
    timestamp: Union[str, datetime.datetime, pd.Timestamp] = None,
    image_format: str = "jpeg",
    offset: int = None,
) -> None:
    """
    Converts a video to images with an associated timestamp.

    Parameters
    ----------
    video_path : str or Path
        Relative or absolute path of the video to convert.
    output_path : str or Path
        Relative or absolute path of the folder to save the images to. If
        the folder does not exist, it will be created.
    timestamp : str, datetime.datetime or pd.Timestamp
        Timestamp of the beginning of the video. If no timestamp is
        provided, it will be automatically extracted from the metadata.
    image_format : str
        Image format of the output images. Possible values are:

            - 'jpeg'
            - 'png'
    offset : int
        Offset (in seconds) to convert frames to images. For example, if
        offset is 1, the output images will correspond to 1 second-separated
        frames of the video. If offset is None, all the frames in the video
        will be converted to images.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If image_format is not 'jpeg' or 'png'.
    KeyError
        If no timestamp is given and the video metadata has no creation
        time.

    """
    if not isinstance(video_path, pathlib.Path):
        video_path = pathlib.Path(video_path)
    if not isinstance(output_path, pathlib.Path):
        output_path = pathlib.Path(output_path)

    if image_format not in ("jpeg", "png"):
        raise ValueError("image_format must be one of ['jpeg', 'png'].")

    ext = "jpg" if image_format == "jpeg" else image_format

    if timestamp is not None:
        start = pd.Timestamp(timestamp)
    else:
        info = ffmpeg.probe(video_path.as_posix())
        try:
            start = info["format"]["tags"]["creation_time"]
        except KeyError as err:
            raise KeyError(
                f"{video_path.as_posix()} does not have a creation date."
            ) from err
        start = pd.Timestamp(start)

    video = cv2.VideoCapture(video_path.as_posix())
    try:
        frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        # File names are zero-padded to the number of digits of the frame
        # count so they sort in frame order.
        width = len(str(frames))
        datetime_code = _get_exif_code("DateTimeOriginal")

        output_path.mkdir(parents=True, exist_ok=True)

        count = 1
        flag, arr = video.read()
        while flag:
            # Swap the channel order of the decoded frame (OpenCV delivers
            # BGR) before handing it to PIL.
            image = Image.fromarray(cv2.cvtColor(arr, cv2.COLOR_BGR2RGB))
            exif = image.getexif()
            frame_time = start + pd.Timedelta(
                milliseconds=video.get(cv2.CAP_PROP_POS_MSEC)
            )
            exif[datetime_code] = frame_time.strftime("%Y:%m:%d %H:%M:%S")
            name = video_path.stem + "_" + str(count).zfill(width) + f".{ext}"
            image.save(
                output_path.joinpath(name).as_posix(), format=image_format, exif=exif
            )
            if offset:
                # Jump to the position of the next requested frame.
                video.set(cv2.CAP_PROP_POS_MSEC, count * (offset * 1e3))
            flag, arr = video.read()
            count += 1
    finally:
        # Always free the capture handle, even if a frame fails to save.
        video.release()

reduce_image_size(image_path, output_path, factor=0.9, method=1)

Reduces image file size by resampling using a given factor.

Parameters:

Name Type Description Default
image_path str or Path

Relative or absolute path of the image to resample.

required
output_path str or Path

Relative or absolute path of the output image.

required
factor float

Resampling factor.

0.9
method int

Image resizing method used by PIL. Possible values are:

  • 0: PIL.Image.Resampling.NEAREST
  • 1: PIL.Image.Resampling.LANCZOS
  • 2: PIL.Image.Resampling.BILINEAR
  • 3: PIL.Image.Resampling.BICUBIC
1

Returns:

Type Description
None
Source code in wiutils/preprocessing.py
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
def reduce_image_size(
    image_path: Union[str, pathlib.Path],
    output_path: Union[str, pathlib.Path],
    factor: float = 0.9,
    method: int = 1,
) -> None:
    """
    Reduces image file size by resampling using a given factor.

    Parameters
    ----------
    image_path : str or Path
        Relative or absolute path of the image to resample.
    output_path : str or Path
        Relative or absolute path of the output image.
    factor : float
        Resampling factor. Both the width and the height of the image are
        multiplied by this value and rounded to the nearest integer.
    method : int
        Image resizing method used by PIL. Possible values are:

            - 0: PIL.Image.Resampling.NEAREST
            - 1: PIL.Image.Resampling.LANCZOS
            - 2: PIL.Image.Resampling.BILINEAR
            - 3: PIL.Image.Resampling.BICUBIC

    Returns
    -------
    None

    """
    if not isinstance(image_path, pathlib.Path):
        image_path = pathlib.Path(image_path)
    if not isinstance(output_path, pathlib.Path):
        output_path = pathlib.Path(output_path)

    # The context manager guarantees the source file handle is closed,
    # even if resizing or saving fails.
    with Image.open(image_path.as_posix()) as image:
        # The original metadata is carried over to the resized image.
        exif = image.getexif()
        new_size = (round(image.width * factor), round(image.height * factor))
        result = image.resize(new_size, method)

        result.save(output_path.as_posix(), format=image.format, exif=exif)

wiutils.plotting

Functions to plot information from the images and deployments tables.

plot_activity_hours(images, names, kind='kde', polar=False, hist_kws=None, kde_kws=None, polar_kws=None)

Plots the activity hours of one or multiple taxa by grouping all observations into a 24-hour range.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
names list, str or Series

List of names to plot activity hours for.

required
kind str

Type of plot. Values can be:

  • 'area' for area plot (only valid when polar is True).
  • 'hist' for histogram.
  • 'kde' for kernel density estimate plot.
'kde'
polar bool

Whether to use a polar (i.e. circular projection) for the plot. If polar is True, kind must be one of 'area' or 'hist'. Otherwise it must be one of 'hist' or 'kde'.

False
hist_kws dict

Keyword arguments passed to the seaborn.histplot() function. Only has effect if kind is 'hist' and polar is False.

None
kde_kws dict

Keyword arguments passed to the seaborn.kdeplot() function. Only has effect if kind is 'kde'.

None
polar_kws dict

Keyword arguments passed to a local function when polar is True, regardless of kind. Possible arguments are:

  • 'density': True or False. Whether to compute density or counts. Default is False.
  • 'fill': True or False. Whether to fill the area under the line (when kind is 'area') or the rectangles (when kind is 'hist'). Default is True.
None

Returns:

Type Description
Axes

Plot axes.

Source code in wiutils/plotting.py
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
@mpl.rc_context(fname=CONFIG_FILE)
def plot_activity_hours(
    images: pd.DataFrame,
    names: Union[list, str, pd.Series],
    kind: str = "kde",
    polar: bool = False,
    hist_kws: dict = None,
    kde_kws: dict = None,
    polar_kws: dict = None,
) -> Union[plt.Axes, plt.PolarAxes]:
    """
    Plots the activity hours of one or multiple taxa by grouping all
    observations into a 24-hour range.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    names : list, str or Series
        List of names to plot activity hours for. A single name can be
        passed as a string.
    kind : str
        Type of plot. Values can be:

        - 'area' for area plot (only when polar is True).
        - 'hist' for histogram.
        - 'kde' for kernel density estimate plot (only when polar is
          False).
    polar : bool
        Whether to use a polar (i.e. circular projection) for the plot.
        If polar is True, kind must be one of 'area' or 'hist'. Otherwise
        it must be one of 'hist' or 'kde'.
    hist_kws : dict
        Keyword arguments passed to the seaborn.histplot() function. Only
        has effect if kind is 'hist' and polar is False.
    kde_kws : dict
        Keyword arguments passed to the seaborn.kdeplot() function. Only
        has effect if kind is 'kde'.
    polar_kws : dict
        Keyword arguments passed to a local function when polar is True,
        regardless of kind. Possible arguments are:

            - 'density': True or False. Whether to compute density or
            counts. Default is False.
            - 'fill': True or False. Whether to fill the area under the
            line (when kind is 'area') or the rectangles (when kind is
            'hist'). Default is True.

    Returns
    -------
    Axes
        Plot axes.

    Raises
    ------
    ValueError
        If any of the names is not found in images, or if kind is not
        valid for the selected projection.

    """
    if isinstance(names, str):
        names = [names]

    if hist_kws is None:
        hist_kws = {}
    if kde_kws is None:
        kde_kws = {}
    if polar_kws is None:
        polar_kws = {}

    taxa = get_lowest_taxon(images, return_rank=False)
    inconsistent_names = set(names) - set(taxa)
    if inconsistent_names:
        raise ValueError(f"{list(inconsistent_names)} were not found in images.")

    images = images.copy()
    images["taxon"] = taxa
    images = images.loc[images["taxon"].isin(names), :].reset_index(drop=True)
    images[_labels.images.date] = pd.to_datetime(images[_labels.images.date])
    # Hour of the day as a decimal number (minutes become a fraction).
    images["hour"] = images[_labels.images.date].dt.hour + (
        images[_labels.images.date].dt.minute / 60
    )

    # Each image is duplicated by its number of objects to properly
    # account for those images with more than one animal.
    images = images.loc[
        images.index.repeat(images[_labels.images.objects])
    ].reset_index(drop=True)

    if polar:
        if kind in ("area", "hist"):
            ax = _plot_polar(images, "hour", hue="taxon", kind=kind, **polar_kws)
        elif kind == "kde":
            raise ValueError("kind cannot be 'kde' when polar=True.")
        else:
            raise ValueError("kind must be one of ['area', 'hist']")

        # Clock-like layout: midnight at the top, hours running clockwise.
        ax.set_theta_direction(-1)
        ax.set_theta_zero_location("N")
        x_labels = [f"{h:02}:00" for h in np.arange(0, 24, 2)]
        # Set the grid on the axes that was just drawn instead of relying
        # on pyplot's implicit "current axes".
        ax.set_thetagrids(np.arange(0, 360, 360 // 12), x_labels)

    else:
        images = images[["taxon", "hour"]]
        if kind == "area":
            raise ValueError("kind cannot be 'area' when polar=False.")
        elif kind == "hist":
            ax = sns.histplot(
                data=images,
                x="hour",
                hue="taxon",
                binwidth=1,
                binrange=(0, 24),
                discrete=False,
                **hist_kws,
            )
        elif kind == "kde":
            ax = sns.kdeplot(data=images, x="hour", hue="taxon", **kde_kws)
        else:
            raise ValueError("kind must be one of ['hist', 'kde']")

        x_ticks = np.arange(0, 26, 2)
        x_labels = [f"{h:02}:00" for h in x_ticks]
        # Leave a two-hour margin on both sides of the 0-24 range.
        ax.set_xlim(-2, 26)
        ax.set_xticks(x_ticks, labels=x_labels)

    return ax

plot_date_ranges(images=None, deployments=None, source='both', **kwargs)

Plots deployment date ranges.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

None
deployments DataFrame

DataFrame with the project's deployments.

None
source str

Source to plot date ranges from. Values can be:

  • 'images' to plot date ranges from images (i.e. first image to last image taken).
  • 'deployments' to plot date ranges from deployments information (i.e. start date and end date).
  • 'both' to plot both sources in two different subplots.
'both'

kwargs Keyword arguments passed to the sns.relplot() function.

Returns:

Type Description
Axes

Plot axes.

Source code in wiutils/plotting.py
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
@mpl.rc_context(fname=CONFIG_FILE)
def plot_date_ranges(
    images: pd.DataFrame = None,
    deployments: pd.DataFrame = None,
    source: str = "both",
    **kwargs,
) -> plt.Axes:
    """
    Draws one line per deployment spanning its start and end dates.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    deployments : DataFrame
        DataFrame with the project's deployments.
    source : str
        Source to plot date ranges from. Values can be:

            - 'images' to plot date ranges from images (i.e. first image
            to last image taken).
            - 'deployments' to plot date ranges from deployments
            information (i.e. start date and end date).
            - 'both' to plot both sources in two different subplots.

    kwargs
        Keyword arguments passed to the sns.relplot() function.

    Returns
    -------
    Axes
        Plot axes (the ``axes`` attribute of the object returned by
        sns.relplot()).

    """
    id_column = _labels.deployments.deployment_id

    ranges = get_date_ranges(
        images, deployments, source, compute_delta=False, pivot=False
    )

    # Long format: one row per (deployment, source, start-or-end date),
    # sorted chronologically.
    dates = (
        pd.melt(
            ranges,
            id_vars=[id_column, "source"],
            value_vars=[_labels.deployments.start, _labels.deployments.end],
        )
        .rename(columns={"value": "date"})
        .sort_values("date")
        .reset_index(drop=True)
    )

    grid = sns.relplot(
        data=dates,
        x="date",
        y=id_column,
        row="source",
        kind="line",
        units=id_column,
        estimator=None,
        facet_kws=dict(despine=False),
        **kwargs,
    )

    return grid.axes

plot_detection_history(images, deployments, name, mask=False, compute_detection_history_kws=None, heatmap_kws=None)

Plots detection history matrix for a given species.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
deployments DataFrame

DataFrame with the project's deployments.

required
name str

Scientific name of the species to plot the detection history for.

required
mask bool

Whether to mask cells where cameras were not functioning. If True, those cells won't be displayed. Otherwise, they will be displayed as zero.

False
compute_detection_history_kws dict

Keyword arguments for the wiutils.compute_detection_history() function.

None
heatmap_kws dict

Keyword arguments for the seaborn.heatmap() function.

None

Returns:

Type Description
Axes

Plot axes.

Source code in wiutils/plotting.py
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
@mpl.rc_context(fname=CONFIG_FILE)
def plot_detection_history(
    images: pd.DataFrame,
    deployments: pd.DataFrame,
    name: str,
    mask: bool = False,
    compute_detection_history_kws: dict = None,
    heatmap_kws: dict = None,
) -> plt.Axes:
    """
    Draws the detection history matrix of a single species as a heatmap.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    deployments : DataFrame
        DataFrame with the project's deployments.
    name : str
        Scientific name of the species to plot the detection history for.
    mask : bool
        Whether to mask cells where cameras were not functioning. If True,
        those cells won't be displayed. Otherwise, they will be displayed
        as zero.
    compute_detection_history_kws : dict
        Keyword arguments for the wiutils.compute_detection_history()
        function.
    heatmap_kws : dict
        Keyword arguments for the seaborn.heatmap() function.

    Returns
    -------
    Axes
        Plot axes.

    Raises
    ------
    ValueError
        If name is not among the taxa found in images.

    """
    history_kws = (
        {} if compute_detection_history_kws is None else compute_detection_history_kws
    )
    plot_kws = {} if heatmap_kws is None else heatmap_kws

    # Validate the requested name before the detection history is computed.
    taxa = get_lowest_taxon(images, return_rank=False)
    if name not in taxa.unique():
        raise ValueError(f"{name} was not found in images.")

    history = compute_detection_history(
        images, deployments, pivot=True, **history_kws
    )
    # Keep only the rows of the requested taxon, indexed by deployment.
    history = (
        history[history["taxon"] == name]
        .drop(columns="taxon")
        .set_index(_labels.images.deployment_id)
    )

    if not mask:
        # Missing cells are shown as zero instead of being left blank.
        history = history.fillna(0)

    return sns.heatmap(data=history, **plot_kws)

wiutils.reading

Functions to read information from WI projects.

load_demo(name)

Loads the cameras, deployments, images and projects tables from a demo dataset.

Parameters:

Name Type Description Default
name str

Demo dataset name. Can be one of:

  • 'cajambre'
  • 'cristales'
required

Returns:

Type Description
DataFrame

Demo cameras dataframe

DataFrame

Demo deployments dataframe

DataFrame

Demo images dataframe

DataFrame

Demo projects dataframe

Source code in wiutils/reading.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def load_demo(name: str) -> tuple:
    """
    Loads the cameras, deployments, images and projects tables from a
    demo dataset.

    Parameters
    ----------
    name : str
        Demo dataset name. Can be one of:

            - 'cajambre'
            - 'cristales'

    Returns
    -------
    DataFrame
        Demo cameras dataframe
    DataFrame
        Demo deployments dataframe
    DataFrame
        Demo images dataframe
    DataFrame
        Demo projects dataframe

    Raises
    ------
    ValueError
        If name is not one of the available demo datasets.

    """
    demos = ("cajambre", "cristales")
    if name not in demos:
        raise ValueError(f"name must be one of {list(demos)}")

    # Demo bundles are zip files stored in the data folder next to this
    # module.
    path = pathlib.Path(__file__).parent.joinpath("data", f"{name}.zip")

    return read_bundle(path)

read_bundle(path)

Reads the cameras, deployments, images and projects tables from a specific Wildlife Insights project bundle.

Parameters:

Name Type Description Default
path str or Path

Absolute or relative path of the project bundle. Can be a folder with all the respective csv files inside or a zip file.

required

Returns:

Type Description
DataFrame

Bundle cameras dataframe

DataFrame

Bundle deployments dataframe

DataFrame

Bundle images dataframe

DataFrame

Bundle projects dataframe

Source code in wiutils/reading.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def read_bundle(path: Union[str, pathlib.Path]) -> tuple:
    """
    Reads the four tables (cameras, deployments, images and projects) of
    a Wildlife Insights project bundle.

    Parameters
    ----------
    path : str or Path
        Absolute or relative path of the project bundle. Can be a folder
        with all the respective csv files inside or a zip file.

    Returns
    -------
    DataFrame
        Bundle cameras dataframe
    DataFrame
        Bundle deployments dataframe
    DataFrame
        Bundle images dataframe
    DataFrame
        Bundle projects dataframe

    """
    # The readers run in the same order as the returned tuple.
    readers = (read_cameras, read_deployments, read_images, read_projects)
    return tuple(reader(path) for reader in readers)

read_cameras(path, **kwargs)

Reads the cameras' table from a specific Wildlife Insights project bundle.

Parameters:

Name Type Description Default
path str or Path

Absolute or relative path of the project bundle. Can be a folder with all the respective csv files inside or a zip file.

required
kwargs

Keyword arguments passed to the pd.read_csv function.

required

Returns:

Type Description
DataFrame

Bundle cameras dataframe

Source code in wiutils/reading.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def read_cameras(path: Union[str, pathlib.Path], **kwargs) -> pd.DataFrame:
    """
    Reads the cameras table of a Wildlife Insights project bundle.

    Parameters
    ----------
    path : str or Path
        Absolute or relative path of the project bundle. Can be a folder
        with all the respective csv files inside or a zip file.
    kwargs
        Keyword arguments passed to the pd.read_csv function.

    Returns
    -------
    DataFrame
        Bundle cameras dataframe

    """
    cameras = _read_file(path, "cameras", **kwargs)
    return cameras

read_deployments(path, **kwargs)

Reads the deployments' table from a specific Wildlife Insights project bundle. Start and end column values are automatically parsed as dates.

Parameters:

Name Type Description Default
path str or Path

Absolute or relative path of the project bundle. Can be a folder with all the respective csv files inside or a zip file.

required
kwargs

Keyword arguments passed to the pd.read_csv function.

required

Returns:

Type Description
DataFrame

Bundle deployments dataframe

Source code in wiutils/reading.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def read_deployments(path: Union[str, pathlib.Path], **kwargs) -> pd.DataFrame:
    """
    Reads the deployments table of a Wildlife Insights project bundle,
    parsing the start and end columns as dates.

    Parameters
    ----------
    path : str or Path
        Absolute or relative path of the project bundle. Can be a folder
        with all the respective csv files inside or a zip file.
    kwargs
        Keyword arguments passed to the pd.read_csv function. Any
        parse_dates value given here is replaced by the start and end
        columns.

    Returns
    -------
    DataFrame
        Bundle deployments dataframe

    """
    date_columns = [_labels.deployments.start, _labels.deployments.end]
    kwargs["parse_dates"] = date_columns
    return _read_file(path, "deployments", **kwargs)

read_images(path, **kwargs)

Reads the images' table from a specific Wildlife Insights project bundle. Timestamp column values are automatically parsed as dates.

Parameters:

Name Type Description Default
path str or Path

Absolute or relative path of the project bundle. Can be a folder with all the respective csv files inside or a zip file.

required
kwargs

Keyword arguments passed to the pd.read_csv function.

required

Returns:

Type Description
DataFrame

Bundle images dataframe

Source code in wiutils/reading.py
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def read_images(path: Union[str, pathlib.Path], **kwargs) -> pd.DataFrame:
    """
    Reads the images table of a Wildlife Insights project bundle, parsing
    the timestamp column as dates.

    Parameters
    ----------
    path : str or Path
        Absolute or relative path of the project bundle. Can be a folder
        with all the respective csv files inside or a zip file.
    kwargs
        Keyword arguments passed to the pd.read_csv function. Any
        parse_dates value given here is replaced by the timestamp column.

    Returns
    -------
    DataFrame
        Bundle images dataframe

    """
    kwargs["parse_dates"] = [_labels.images.date]
    return _read_file(path, "images", **kwargs)

read_projects(path, **kwargs)

Reads projects table from a specific Wildlife Insights project bundle.

Parameters:

Name Type Description Default
path str or Path

Absolute or relative path of the project bundle. Can be a folder with all the respective csv files inside or a zip file.

required
kwargs

Keyword arguments passed to the pd.read_csv function.

required

Returns:

Type Description
DataFrame

Bundle projects dataframe

Source code in wiutils/reading.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
def read_projects(path: Union[str, pathlib.Path], **kwargs) -> pd.DataFrame:
    """
    Reads the projects table of a Wildlife Insights project bundle.

    Parameters
    ----------
    path : str or Path
        Absolute or relative path of the project bundle. Can be a folder
        with all the respective csv files inside or a zip file.
    kwargs
        Keyword arguments passed to the pd.read_csv function.

    Returns
    -------
    DataFrame
        Bundle projects dataframe

    """
    projects = _read_file(path, "projects", **kwargs)
    return projects

wiutils.summarizing

Functions to create new tables or modify existing ones from WI data.

compute_count_summary(images, deployments=None, groupby='deployment', add_records_by_class=False, add_taxa_by_class=False, remove_unidentified_kws=None, remove_duplicates_kws=None)

Computes a summary of images, records and taxa count by deployment.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
deployments DataFrame

DataFrame with the project's deployments. Must be passed only if groupby is 'location'.

None
groupby str

Level to group results by. Can be one of:

  • 'deployment' to group by deployment (deployment_id)
  • 'location' to group by location (placename)
'deployment'
add_records_by_class bool

Whether to add number of independent records (i.e. number of individuals after duplicate image removal).

False
add_taxa_by_class bool

Whether to add number of unique taxa.

False
remove_unidentified_kws dict

Keyword arguments for the wiutils.remove_unidentified function.

None
remove_duplicates_kws dict

Keyword arguments for the wiutils.remove_duplicates function.

None

Returns:

Type Description
DataFrame

Summary of images, records and species count by deployment.

Source code in wiutils/summarizing.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def compute_count_summary(
    images: pd.DataFrame,
    deployments: pd.DataFrame = None,
    groupby: str = "deployment",
    add_records_by_class: bool = False,
    add_taxa_by_class: bool = False,
    remove_unidentified_kws: dict = None,
    remove_duplicates_kws: dict = None,
) -> pd.DataFrame:
    """
    Computes a summary of image, record and taxa counts by deployment or
    location.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    deployments : DataFrame
        DataFrame with the project's deployments. Must be passed only if
        groupby is 'location'.
    groupby : str
        Level to group results by. Can be one of:

            - 'deployment' to group by deployment (deployment_id)
            - 'location' to group by location (placename)
    add_records_by_class : bool
        Whether to add one column per class with the number of
        independent records (i.e. number of individuals after duplicate
        image removal).
    add_taxa_by_class : bool
        Whether to add one column per class with the number of unique
        taxa.
    remove_unidentified_kws : dict
        Keyword arguments for the wiutils.remove_unidentified function.
        Defaults to ``{"rank": "class"}``. The ``reset_index`` key is
        always forced to True. The passed dictionary is not modified.
    remove_duplicates_kws : dict
        Keyword arguments for the wiutils.remove_duplicates function.
        The ``reset_index`` key is always forced to True. The passed
        dictionary is not modified.

    Returns
    -------
    DataFrame
        One row per group with the columns total_images,
        identified_images, records and taxa (plus the optional by-class
        columns). Groups with no data for a column get 0 and all count
        columns are integers.

    """
    images = images.copy()

    # Copy the keyword dictionaries before forcing reset_index so the
    # caller's objects are left untouched.
    if remove_unidentified_kws is None:
        remove_unidentified_kws = {"rank": "class"}
    else:
        remove_unidentified_kws = dict(remove_unidentified_kws)
    if remove_duplicates_kws is None:
        remove_duplicates_kws = {}
    else:
        remove_duplicates_kws = dict(remove_duplicates_kws)

    remove_unidentified_kws["reset_index"] = True
    remove_duplicates_kws["reset_index"] = True

    images, groupby_label = _process_groupby_arg(images, deployments, groupby)

    # The result is indexed by every group present in the raw images so
    # groups that lose all their rows in later filters are still reported.
    result = pd.DataFrame(index=sorted(images[groupby_label].unique()))
    result = result.join(images.groupby(groupby_label).size().rename("total_images"))

    images = remove_unidentified(images, **remove_unidentified_kws)
    result = result.join(
        images.groupby(groupby_label).size().rename("identified_images")
    )

    images = remove_duplicates(images, **remove_duplicates_kws)
    result = result.join(
        images.groupby(groupby_label)[_labels.images.objects].sum().rename("records")
    )
    if add_records_by_class:
        classes = images[_labels.images.class_].dropna().unique()
        for class_ in classes:
            subset = images[images[_labels.images.class_] == class_]
            result = result.join(
                subset.groupby(groupby_label)[_labels.images.objects]
                .sum()
                .rename(f"records_{class_.lower()}")
            )

    images["taxon"] = get_lowest_taxon(images, return_rank=False)
    result = result.join(
        images.groupby(groupby_label)["taxon"].nunique().rename("taxa")
    )
    if add_taxa_by_class:
        classes = images[_labels.images.class_].dropna().unique()
        for class_ in classes:
            subset = images[images[_labels.images.class_] == class_]
            result = result.join(
                subset.groupby(groupby_label)["taxon"]
                .nunique()
                .rename(f"taxa_{class_.lower()}")
            )

    # At this point every column is a count and the group labels live in
    # the index, so the whole frame can be cast at once. Casting here
    # (instead of assigning through .iloc after reset_index) guarantees
    # integer dtypes: .iloc assignment may write in place and keep the
    # float dtype introduced by the joins.
    result = result.fillna(0).astype(int)
    result.index.name = groupby_label
    result = result.reset_index()

    return result

compute_detection(images, deployments=None, groupby='deployment', compute_abundance=True, pivot=False)

Computes the detection (in terms of abundance or presence) of each taxon by deployment.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
deployments DataFrame

DataFrame with the project's deployments. Must be passed only if groupby is 'location'.

None
groupby str

Level to group results by. Can be one of:

  • 'deployment' to group by deployment (deployment_id)
  • 'location' to group by location (placename)
'deployment'
compute_abundance bool

Whether to compute the abundance for each deployment. If False, returns presence/absence for the deployments.

True
pivot bool

Whether to pivot (reshape from long to wide format) the resulting DataFrame.

False

Returns:

Type Description
DataFrame

DataFrame with the detection of each species by deployment.

Source code in wiutils/summarizing.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def compute_detection(
    images: pd.DataFrame,
    deployments: pd.DataFrame = None,
    groupby: str = "deployment",
    compute_abundance: bool = True,
    pivot: bool = False,
):
    """
    Computes, for every taxon and every site, either the summed number of
    objects (abundance) or a 0/1 presence flag.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    deployments : DataFrame
        DataFrame with the project's deployments. Must be passed only if
        groupby is 'location'.
    groupby : str
        Level to group results by. Can be one of:

            - 'deployment' to group by deployment (deployment_id)
            - 'location' to group by location (placename)
    compute_abundance : bool
        If True, values are the summed number of objects. If False, any
        positive value is replaced by 1 (presence/absence).
    pivot : bool
        Whether to pivot (reshape from long to wide format) the resulting
        DataFrame, leaving one row per taxon and one column per site.

    Returns
    -------
    DataFrame
        Long format: columns taxon, the grouping label and value. Wide
        format (pivot=True): column taxon plus one column per site.

    """
    identified = remove_unidentified(images.copy(), rank="class", reset_index=True)
    identified, site_label = _process_groupby_arg(identified, deployments, groupby)
    identified["taxon"] = get_lowest_taxon(identified, return_rank=False)

    keys = ["taxon", site_label]
    counts = identified.groupby(keys)[_labels.images.objects].sum()

    # Reindex against the full taxon x site product so combinations that
    # were never observed show up explicitly with a value of zero.
    full_index = pd.MultiIndex.from_product(
        [identified["taxon"].unique(), identified[site_label].unique()],
        names=keys,
    )
    detection = counts.reindex(full_index, fill_value=0).rename("value").reset_index()

    if not compute_abundance:
        # Collapse abundances into a presence flag.
        detection.loc[detection["value"] > 0, "value"] = 1

    detection = detection.sort_values(keys, ignore_index=True)
    if not pivot:
        return detection

    wide = detection.pivot(index="taxon", columns=site_label, values="value")
    return wide.rename_axis(None, axis=1).reset_index()

compute_detection_history(images, deployments, date_range='deployments', days=1, compute_abundance=True, pivot=False)

Computes the detection history (in terms of abundance or presence) by taxon and deployment, grouping observations into specific days-long intervals.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
deployments DataFrame

DataFrame with the project's deployments.

required
date_range str

Table to compute the date range from. Possible values are:

  • 'deployments'
  • 'images'
'deployments'
days int

Days interval to group observations into.

1
compute_abundance bool

Whether to compute the abundance for each interval. If False, returns presence/absence for the intervals.

True
pivot bool

Whether to pivot (reshape from long to wide format) the resulting DataFrame.

False

Returns:

Type Description
DataFrame

Detection history.

Source code in wiutils/summarizing.py
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
def compute_detection_history(
    images: pd.DataFrame,
    deployments: pd.DataFrame,
    date_range: str = "deployments",
    days: int = 1,
    compute_abundance: bool = True,
    pivot: bool = False,
) -> pd.DataFrame:
    """
    Computes the detection history (in terms of abundance or presence) by
    taxon and deployment, grouping observations into specific days-long
    intervals.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    deployments : DataFrame
        DataFrame with the project's deployments.
    date_range : str
        Table to compute the date range from. Possible values are:

            - 'deployments'
            - 'images'
    days : int
        Days interval to group observations into.
    compute_abundance : bool
        Whether to compute the abundance for each interval. If False,
        returns presence/absence for the intervals.
    pivot : bool
        Whether to pivot (reshape from long to wide format) the resulting
        DataFrame.

    Returns
    -------
    DataFrame
        Detection history. Intervals with no observations hold 0 and
        intervals that do not overlap the deployment period of the
        corresponding deployment hold NaN.

    Raises
    ------
    ValueError
        If date_range is not one of 'deployments' or 'images'.

    """
    images = images.copy()
    deployments = deployments.copy()

    images = remove_unidentified(images, rank="class", reset_index=True)

    # Image timestamps are truncated to the day; deployment dates are only
    # parsed.
    images[_labels.images.date] = pd.to_datetime(images[_labels.images.date])
    images[_labels.images.date] = pd.to_datetime(images[_labels.images.date].dt.date)
    deployments[_labels.deployments.start] = pd.to_datetime(
        deployments[_labels.deployments.start]
    )
    deployments[_labels.deployments.end] = pd.to_datetime(
        deployments[_labels.deployments.end]
    )
    if date_range == "deployments":
        start = deployments[_labels.deployments.start].min()
        end = deployments[_labels.deployments.end].max()
    elif date_range == "images":
        start = images[_labels.images.date].min()
        end = images[_labels.images.date].max()
    else:
        raise ValueError("date_range must be one of ['deployments', 'images'].")

    images["taxon"] = get_lowest_taxon(images, return_rank=False)
    freq = pd.Timedelta(days=days)
    # Intervals are anchored at `start` so the grouped labels line up with
    # the dates generated by pd.date_range below.
    groupers = [
        pd.Grouper(key="taxon"),
        pd.Grouper(key=_labels.images.deployment_id),
        pd.Grouper(key=_labels.images.date, freq=freq, origin=start),
    ]
    result = images.groupby(groupers)[_labels.images.objects].sum()

    # A new index with all the combinations of species, sites and dates
    # is created to reindex the result and to assign zeros where there
    # were no observations.
    species = images["taxon"].unique()
    sites = images[_labels.images.deployment_id].unique()
    dates = pd.date_range(start, end, freq=freq)
    idx = pd.MultiIndex.from_product(
        [species, sites, dates],
        names=["taxon", _labels.images.deployment_id, _labels.images.date],
    )
    result = result.reindex(idx, fill_value=0)
    result.name = "value"
    result = result.reset_index()

    if not compute_abundance:
        has_observations = result["value"] > 0
        result.loc[has_observations, "value"] = 1

    # Groups (i.e. days intervals) where the corresponding camera was not
    # deployed at the time are assigned NaNs.
    result = pd.merge(
        result,
        deployments[
            [
                _labels.images.deployment_id,
                _labels.deployments.start,
                _labels.deployments.end,
            ]
        ],
        on=_labels.images.deployment_id,
        how="left",
    )
    group_start = result[_labels.images.date]
    group_end = group_start + pd.Timedelta(days=days - 1)
    deployment_start = result[_labels.deployments.start]
    deployment_end = result[_labels.deployments.end]
    # Two closed intervals overlap when each one starts before the other
    # ends. Checking only whether one of the group's endpoints falls inside
    # the deployment would miss deployments that lie strictly inside a
    # multi-day group. Missing deployment dates (NaT) compare as False, so
    # those rows are still treated as not deployed.
    overlaps = (group_start <= deployment_end) & (group_end >= deployment_start)
    # where() keeps the integer dtype when nothing is masked and upcasts to
    # float only when NaNs are actually needed.
    result["value"] = result["value"].where(overlaps)
    result = result.drop(columns=[_labels.deployments.start, _labels.deployments.end])

    result = result.sort_values(
        ["taxon", _labels.images.deployment_id, _labels.images.date], ignore_index=True
    )

    if pivot:
        # Dates become plain strings so the wide-format column labels are
        # text rather than timestamps.
        result[_labels.images.date] = result[_labels.images.date].astype(str)
        result = result.pivot(
            index=["taxon", _labels.images.deployment_id],
            columns=_labels.images.date,
            values="value",
        )
        result = result.rename_axis(None, axis=1).reset_index()

    return result

compute_general_count(images, deployments=None, groupby='deployment', add_taxonomy=False, rank='class')

Computes the general abundance and number of deployments for each taxon.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
deployments DataFrame

DataFrame with the project's deployments. Must be passed only if groupby is 'location'.

None
groupby str

Level to group results by. Can be one of:

  • 'deployment' to group by deployment (deployment_id)
  • 'location' to group by location (placename)
'deployment'
add_taxonomy bool

Whether to add the superior taxonomy of the species to the result.

False
rank str

Upper taxonomic rank to extract classification for. Possible values are:

  • 'epithet'
  • 'genus'
  • 'family'
  • 'order'
  • 'class'

For example, if rank is 'family', the result will include the corresponding family (and therefore the inferior ranks, genus and epithet).
'class'

Returns:

Type Description
DataFrame

DataFrame with abundance and number of deployments by species.

Source code in wiutils/summarizing.py
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
def compute_general_count(
    images: pd.DataFrame,
    deployments: pd.DataFrame = None,
    groupby: str = "deployment",
    add_taxonomy: bool = False,
    rank: str = "class",
):
    """
    Computes, for each taxon, the total number of objects and the number
    of distinct sites (deployments or locations) where it was recorded.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    deployments : DataFrame
        DataFrame with the project's deployments. Must be passed only if
        groupby is 'location'.
    groupby : str
        Level to count sites by. Can be one of:

            - 'deployment' to count deployments (deployment_id)
            - 'location' to count locations (placename)
    add_taxonomy : bool
        Whether to append taxonomy columns for each taxon to the result.
    rank : str
        Upper taxonomic rank used to select which taxonomy columns are
        appended. Only used when add_taxonomy is True. Possible values
        are:

            - 'epithet'
            - 'genus'
            - 'family'
            - 'order'
            - 'class'

    Returns
    -------
    DataFrame
        One row per taxon with the columns taxon, n (summed number of
        objects) and '<groupby>s' (number of distinct sites), followed by
        the taxonomy columns when requested.

    """
    images = images.copy()
    images, site_label = _process_groupby_arg(images, deployments, groupby)
    images["taxon"] = get_lowest_taxon(images, return_rank=False)

    objects_label = _labels.images.objects
    aggregations = {objects_label: "sum", site_label: "nunique"}
    output_names = {objects_label: "n", site_label: f"{groupby}s"}
    counts = (
        images.groupby("taxon")
        .agg(aggregations)
        .rename(columns=output_names)
        .reset_index()
    )

    if not add_taxonomy:
        return counts

    # Keep one taxonomy row per taxon (the first one found) and attach it.
    rank_columns = _utils.taxonomy.get_taxonomy_columns(rank)
    lookup = images[["taxon", *rank_columns]].drop_duplicates("taxon")
    return pd.merge(counts, lookup, on="taxon", how="left")

compute_hill_numbers(images, deployments=None, groupby='deployment', q_values=(0, 1, 2), pivot=False)

Computes the Hill numbers of order q (also called effective number of species) by site for some given values of q.

Parameters:

Name Type Description Default
images DataFrame

DataFrame with the project's images.

required
deployments DataFrame

DataFrame with the project's deployments. Must be passed only if groupby is 'location'.

None
groupby str

Level to group results by. Can be one of:

  • 'deployment' to group by deployment (deployment_id)
  • 'location' to group by location (placename)
'deployment'
q_values int, list, tuple or array

Value(s) of q to compute Hill numbers for.

(0, 1, 2)
pivot bool

Whether to pivot (reshape from long to wide format) the resulting DataFrame.

False

Returns:

Type Description
DataFrame

Computed Hill numbers by deployment.

Source code in wiutils/summarizing.py
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
def compute_hill_numbers(
    images: pd.DataFrame,
    deployments: pd.DataFrame = None,
    groupby: str = "deployment",
    q_values: Union[int, list, tuple, np.ndarray] = (0, 1, 2),
    pivot: bool = False,
) -> pd.DataFrame:
    """
    Computes the Hill numbers of order q (also called effective number of
    species) by site for some given values of q.

    Parameters
    ----------
    images : DataFrame
        DataFrame with the project's images.
    deployments : DataFrame
        DataFrame with the project's deployments. Must be passed only if
        groupby is 'location'.
    groupby : str
        Level to group results by. Can be one of:

            - 'deployment' to group by deployment (deployment_id)
            - 'location' to group by location (placename)
    q_values : int, list, tuple or array
        Value(s) of q to compute Hill numbers for. A single scalar
        (including NumPy scalars) is treated as a one-element sequence.
    pivot : bool
        Whether to pivot (reshape from long to wide format) the resulting
        DataFrame.

    Returns
    -------
    DataFrame
        Computed Hill numbers by site. Long format: columns with the
        grouping label, q and D. Wide format (pivot=True): the grouping
        label plus one column per value of q, named after str(q).

    """
    images = images.copy()

    # np.isscalar also recognises NumPy integers and floats, which a plain
    # isinstance(..., int) check would try (and fail) to iterate over.
    # Materializing the sequence lets it be traversed once per site.
    if np.isscalar(q_values):
        q_values = [q_values]
    else:
        q_values = list(q_values)

    images, groupby_label = _process_groupby_arg(images, deployments, groupby)
    images["taxon"] = get_lowest_taxon(images, return_rank=False)
    abundance = images.groupby([groupby_label, "taxon"])[_labels.images.objects].sum()
    # Proportion of each taxon within its own site (index level 0).
    relative_abundance = abundance / abundance.groupby(level=0).sum()

    rows = [
        {
            groupby_label: site,
            "q": q,
            "D": _compute_q_diversity_index(group.to_numpy(), q),
        }
        for site, group in relative_abundance.groupby(level=0)
        for q in q_values
    ]
    # Explicit columns keep the expected layout even when there are no
    # rows, so the pivot below does not fail on missing columns.
    result = pd.DataFrame(rows, columns=[groupby_label, "q", "D"])

    if pivot:
        result["q"] = result["q"].astype(str)
        result = result.pivot(index=groupby_label, columns="q", values="D")
        result = result.rename_axis(None, axis=1).reset_index()

    return result