Source code for flagit.flagit

# The MIT License (MIT)
#
# Copyright (c) 2020 TU Wien
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import pandas as pd
import numpy as np
from scipy.signal import savgol_filter as savgol
from functools import reduce
import warnings
from flagit.settings import Variables


class FormatError(Exception):
    """Raised by Interface when the provided data is not a pandas.DataFrame."""

class VariableNotKnown(Exception):
    """Raised when no column of the DataFrame is named after a known variable."""

# Module-wide settings object: the flag methods read the list of known variables,
# the lower/upper boundaries per variable and the ancillary thresholds from it.
t = Variables()


[docs]class Interface(object):
    """
    Class providing an interface to apply ISMN quality control procedures to in situ soil moisture data.

    Upon initialization it checks if the provided DataFrame has the required format. Quality control procedures can
    then be applied using the Interface.run function. The flags are provided as additional tags in column "qflag";
    each tag belongs to one of three main categories: C (exceeding plausible geophysical range),
    D (questionable/dubious) or G (good).
    For a detailed description of the algorithms please see: Dorigo, W. A., Xaver, A., Vreugdenhil, M., Gruber, A.,
    Hegyiova, A., Sanchis-Dufau, A. D., ... & Drusch, M. (2013). Global automated quality control of in situ soil
    moisture data from the International Soil Moisture Network. Vadose Zone Journal, 12(3), doi:10.2136/vzj2012.0097.

    If variable is not soil moisture but one of the following list:
    [soil temperature, air temperature, precipitation, soil suction, snow water equivalent, snow depth,
    soil surface temperature] then for the ISMN quality control only threshold flags (c01 and c02) are applied.

    It is required that the first column of the dataframe is equal to the name of the variable.

    Parameters
    ----------
    data : pandas.DataFrame
        Input for Interface Object containing in situ soil moisture measurements
    sat_point : float
            Saturation Point in % vol for soil at the respective location.
            At ISMN the saturation point is calculated from Harmonized World Soil Database (HWSD) sand, clay and organic
            content for each station using Equations [2,3,5] from Saxton & Rawls (2006).
            (Saxton, K. E., & Rawls, W. J. (2006). Soil water characteristic estimates by texture and organic matter for
            hydrologic solutions. Soil science society of America Journal, 70(5), 1569-1578. doi:10.2136/sssaj2005.0117)

    Raises
    ------
    FormatError
        if provided Input is no DataFrame or does not meet the required format

    Attributes
    ----------
    data : pandas.DataFrame
        DataFrame containing in situ soil moisture measurement

    Methods
    -------
    run(name,sat_point)
        Apply ISMN quality control to in situ soil moisture measurements

    """

    def __init__(self, data, sat_point=None, depth_from=None):
        self.data = data
        self.sat_point = sat_point
        self.depth_from = depth_from
        self.variable = None

        if not type(self.data) == pd.DataFrame:
            raise FormatError('Please provide pandas.DataFrame as data.')

        if 'soil_moisture' not in self.data.columns:
            self.variable = self.get_variable_from_data()
            self.data['qflag'] = data[self.variable].apply(lambda x: set())

        else:
            self.data['qflag'] = data.soil_moisture.apply(lambda x: set())
            self.variable = 'soil_moisture'

[docs]    def run(
        self, name=None, sat_point=None, depth_from=None, flag_numbers=False
    ) -> pd.DataFrame:
        """
        Applies all quality control algorithms when keyword name is not set. However for flag C03 a threshold value
        (saturation point: highest soil moisture value depending on soil properties) is needed as input.

        Parameters
        ----------
        name : list
            provide list of flags to only apply these flags
        sat_point : float
                Saturation Point in % vol for soil at the respective location.
                At ISMN the saturation point is calculated from Harmonized World Soil Database (HWSD) sand, clay
                and organic content for each station using Equations [2,3,5] from Saxton & Rawls (2006).
                (Saxton, K. E., & Rawls, W. J. (2006). Soil water characteristic estimates by texture and organic matter
                for hydrologic solutions. Soil science society of America Journal, 70(5), 1569-1578.)
        depth_from : Decimal
                Used to calculate minimum precipitation necessary to consitute a rain event for flags D04 and D05.
                Also used to skip sensor depths >=10cm for D04 and D05 (applied to surface soil moisture sensors only).
        flag_numbers : bool
                if true flag numbers are used as tags in the qflag column instead of flag ids (e.g.: '1' instead of
                'C01', '14' instead of 'G')

        Returns
        -------
        pandas.DataFrame
            DataFrame including ISMN quality flags in column "qflag".
        """
        keys = self.data.keys()

        if name:
            assert isinstance(name, (list)), "If 'name' is provided then it must be a list"

        if not self.sat_point:
            self.sat_point = sat_point
        if not self.depth_from:
            self.depth_from = depth_from
        if self.variable == 'soil_moisture':
            self.apply_savgol()
        if not flag_numbers:
            flags_dict = {
                'C01': self.flag_C01, 'C02': self.flag_C02, 'C03': self.flag_C03, 'D01': self.flag_D01,
                'D02': self.flag_D02, 'D03': self.flag_D03, 'D04': self.flag_D04, 'D05': self.flag_D05,
                'D06': self.flag_D06, 'D07': self.flag_D07, 'D09': self.flag_D09, 'D10': self.flag_D10,
                'G': self.flag_G,
            }
        else:
            flags_dict = {
                1: self.flag_C01, 2: self.flag_C02, 3: self.flag_C03, 4: self.flag_D01,
                5: self.flag_D02, 6: self.flag_D03, 7: self.flag_D04, 8: self.flag_D05,
                9: self.flag_D06, 10: self.flag_D07, 12: self.flag_D09, 13: self.flag_D10,
                14: self.flag_G,
            }

        if name is not None:
            if type(name) == list:
                for key in name:
                    flags_dict[key](key)
            elif type(name) == str:
                flags_dict[name](name)
        else:
            for key in flags_dict.keys():
                flags_dict[key](key)

        return self.data[keys]

[docs]    def get_flag_description(self) -> None:
        """
        Prints out table with flag codes and a short description.
        """
        names = [
            'C01', 'C02', 'C03', 'D01', 'D02', 'D03', 'D04', 'D05', 'D06', 'D07', 'D08', 'D09', 'D10', 'G',
        ]
        description = [
            'soil moisture < 0 m³ / m³',
            'soil moisture > 0.60m³ / m³',
            'soil moisture > saturation point(based on HWSD)',
            'negative soil temperature( in situ)',
            'negative air temperature( in situ)',
            'negative soil temperature (GLDAS)',
            'rise in soil moisture without precipitation( in situ)',
            'rise in soil moisture without precipitation(GLDAS)',
            'spikes',
            'negative breaks(drops)',
            'positive breaks(jumps)',
            'constant values following negative break',
            'saturated plateaus',
            'good',
        ]

        titles = ['code', 'description']
        table_create = [titles] + list(zip(names, description))

        for i, d in enumerate(table_create):
            line = ' | '.join(str(x).ljust(4) for x in d)
            print(line)
            if i == 0:
                print('-' * len(line))

[docs]    def apply_savgol(self) -> None:
        """
        Calculates and adds derivations 1 and 2 using Savitzky-Golay filter
        """
        self.data['deriv1'] = savgol(self.data.soil_moisture, 3, 2, 1, mode='nearest')
        self.data['deriv2'] = savgol(self.data.soil_moisture, 3, 2, 2, mode='nearest')


[docs]    def get_variable_from_data(self) -> str:
        """
        Gets first occuring and known Variable from the pandas dataframe
        Returns v:string
        -------

        """
        for v in self.data.keys():
            if v in t.variable_list:
                return v
        raise VariableNotKnown

[docs]    def flag_C01(self, tag):
        """
        Soil moisture below threshold:
        Flags when measurement is below threshold

        Parameters
        ----------
        tag : string or int, optional
        code added to qflag-column when flag-criteria are met
        """
        low_boundary = t.low_boundary(self.variable)

        index = self.data[self.data[self.variable] < low_boundary].index

        if len(index):
            self.data['qflag'][index].apply(lambda x: x.add(tag))

[docs]    def flag_C02(self, tag):
        """
        Soil moisture above threshold:
        Flags when measurement is above threshold

        Parameters
        ----------
        tag : string or int, optional
        code added to qflag-column when flag-criteria are met
        """
        upper_boundary = t.hi_boundary(self.variable)
        index = self.data[self.data[self.variable] > upper_boundary].index
        if len(index):
            self.data['qflag'][index].apply(lambda x: x.add(tag))

[docs]    def flag_C03(self, tag):
        """
        Soil moisture above saturation point:
        Flags when soil moisture is above saturation point.
        At ISMN the saturation point is calculated from Harmonized World Soil Database (HWSD) sand, clay and organic
        content for each station using Equations [2,3,5] from Saxton & Rawls (2006). doi:10.2136/vzj2012.0097.

        Parameters
        ----------
        tag : string
        code added to qflag-column when flag-criteria are met
        """
        if not self.sat_point:
            return

        index = self.data.loc[self.data.soil_moisture > self.sat_point].index
        if len(index):
            self.data['qflag'][index].apply(lambda x: x.add(tag))

[docs]    def flag_D01(self, tag):
        """
        In situ soil temperature below threshold:
        Flags when ancillary in situ soil temperature is below threshold

        Parameters
        ----------
        tag : string or int, optional
        code added to qflag-column when flag-criteria are met
        """
        if 'soil_temperature' in self.data.columns:
            index = self.data[self.data.soil_temperature < t.ancillary_ts_lower].index
            if len(index):
                self.data['qflag'][index].apply(lambda x: x.add(tag))

[docs]    def flag_D02(self, tag):
        """
        In situ air temperature below threshold:
        Flags when ancillary in situ air temperature is below threshold

        Parameters
        ----------
        tag : string or int, optional
        code added to qflag-column when flag-criteria are met
        """
        if 'air_temperature' in self.data.columns:
            index = self.data[self.data['air_temperature'] < t.ancillary_ta_lower].index
            if len(index):
                self.data['qflag'][index].apply(lambda x: x.add(tag))

[docs]    def flag_D03(self, tag):
        """
        GLDAS soil temperature below threshold:
        Flags when ancillary GLDAS NOAA soil temperature is below threshold

        Parameters
        ----------
        tag : string or int, optional
        code added to qflag-column when flag-criteria are met
        """
        if 'gldas_soil_temperature' in self.data.columns:
            index = self.data[
                self.data['gldas_soil_temperature'] < t.ancillary_ts_lower
            ].index
            if len(index):
                self.data['qflag'][index].apply(lambda x: x.add(tag))

[docs]    def flag_D04(self, tag):
        """
        Soil moisture rise without precipitation event (in situ):
        Flags when soil moisture increased both during the last hour and during the preceding 24h (increase is larger
        than 2x std-dev during this period), yet ancillary in situ data shows there was no precipitation event greater
        or equal to the minimum precipitation (depending on sensor depth).

        At ISMN this flag is only applied to surface soil moisture sensors (<= 10cm sensor depth)

        Parameters
        ----------
        tag : string or int, optional
        code added to qflag-column when flag-criteria are met
        """
        if (
            'precipitation' in self.data.columns
            or 'total_precipitation' in self.data.columns
        ):
            min_precipitation = t.ancillary_p_min

            if self.depth_from != None:
                if self.depth_from >= 0.1:
                    return
                if self.depth_from != 0:
                    min_precipitation = float(self.depth_from) * 0.05 * 0.5 * 1000

            if not 'total_precipitation' in self.data.columns:
                self.data['total_precipitation'] = round(
                    self.data['precipitation'].rolling(min_periods=1, window=24).sum(),
                    1,
                )
            self.data['std_x2'] = (
                self.data['soil_moisture'].rolling(min_periods=1, window=25).std() * 2
            )
            self.data['rise24h'] = self.data['soil_moisture'].diff(24)
            self.data['rise1h'] = self.data['soil_moisture'].diff(1)

            index = self.data[
                (self.data['rise1h'] > 0)
                & (self.data['rise24h'] > self.data['std_x2'])
                & ~np.isclose(self.data['rise24h'], self.data['std_x2'])
                & (self.data['total_precipitation'] < min_precipitation)
            ].index

            self.data['qflag'][index].apply(lambda x: x.add(tag))

[docs]    def flag_D05(self, tag):
        """
        Soil moisture rise without precipitation event (Gldas precipitation):
        Flags when soil moisture increased both during the last hour and during the preceding 24h (increase is larger
        than 2x std-dev during this period), yet ancillary GLDAS data shows there was no precipitation event greater
        or equal to the minimum precipitation (depending on sensor depth).

        At ISMN this flag is only applied to surface soil moisture sensors (<= 10cm sensor depth)

        Parameters
        ----------
        tag : string or int, optional
        code added to qflag-column when flag-criteria are met
        """

        if 'gldas_precipitation' in self.data.columns:
            min_precipitation = t.ancillary_p_min
            if self.depth_from != None:
                if self.depth_from >= 0.1:
                    return
                if self.depth_from != 0:
                    min_precipitation = float(self.depth_from) * 0.05 * 0.5 * 1000

            # Add dropna in order not to stretch the period of gldas_total_precipitation for which there is no data
            self.data['gldas_total_precipitation'] = (
                self.data['gldas_precipitation']
                .dropna()
                .rolling(min_periods=1, window=24)
                .sum()
            )
            self.data['gl_std_x2'] = (
                self.data['soil_moisture'].rolling(min_periods=1, window=25).std() * 2
            )
            self.data['gl_rise24h'] = self.data['soil_moisture'].diff(24)
            self.data['gl_rise1h'] = self.data['soil_moisture'].diff(1)

            index = self.data[
                (self.data['gl_rise1h'] > 0)
                & (self.data['gl_rise24h'] > self.data['gl_std_x2'])
                & ~np.isclose(self.data['gl_rise24h'], self.data['gl_std_x2'])
                & (self.data['gldas_total_precipitation'] < min_precipitation)
            ].index

            if len(index):
                self.data['qflag'][index].apply(lambda x: x.add(tag))

[docs]    def flag_D06(self, tag):
        """
        Soil moisture spike:
        Flags when soil moisture shows a positive or negative spike.
        See Eq [4,5,6] in Dorigo et al. (2013), Global Automated Quality Control of In Situ
        Soil Moisture Data from the International Soil Moisture Network,VZJ.

        Parameters
        ----------
        tag : string or int, optional
        code added to qflag-column when flag-criteria are met
        """

        def rolling_var(sm_array) -> float:
            """
            Calulates variance of soil moisture over a time-range from t-12, t+12 hours without the current value

            Parameters
            ----------
            sm_array : numpy.ndarray
                soil moisture values from t-12 to t+12 hours

            Returns
            -------
            float
                Soil moisture variance within 25 hours without the current value
            """
            sm_array = np.delete(sm_array, 12, axis=0)
            sm_array = sm_array[~np.isnan(sm_array)]
            return ((sm_array - sm_array.mean()) ** 2).sum() / (len(sm_array) - 1)

        def rolling_mean(sm_array):
            """
            mean of 12h before and after timestamp.

            Returns
            -------
            float
                Mean soil moisture within 25 hours without the current value
            """
            assert len(sm_array) == 25
            return np.delete(sm_array, 12).mean()

        def peak(sm_array) -> int:
            """
            Checks if middle element of three consecutive soil moisture measurements is a positive or negative peak or
            alternatively, if middle two values of four consecutive measurements are equal and form a positive or
            negative peak.

            Parameters
            ----------
            sm_array : numpy.ndarray
                soil moisture values from t-1 to t+1 hours

            Returns
            -------
            int
               0 (no peak)
               1 (peak)
               2 (peak that lasts 2 hours)
            """
            if ((sm_array[0] < sm_array[1]) & (sm_array[1] > sm_array[2])) | (
                (sm_array[0] > sm_array[1]) & (sm_array[1] < sm_array[2])
            ):
                return 1
            elif len(sm_array) > 3:
                if (
                    (sm_array[0] < sm_array[1])
                    & (sm_array[1] == sm_array[2])
                    & (sm_array[2] > sm_array[3])
                ) | (
                    (sm_array[0] > sm_array[1])
                    & (sm_array[1] == sm_array[2])
                    & (sm_array[2] < sm_array[3])
                ):
                    return 2
            return 0

        self.data['eq4'] = round(self.data['soil_moisture'] .shift(-1)
            .div(self.data['soil_moisture'], axis=0) .shift(1), 3)
        self.data['eq5'] = round(
            abs(self.data['deriv2'].div(self.data['deriv2'].shift(-2), axis=0).shift(1)), 3)

        # calculate relative variance at time t
        self.data['eq6'] = abs(
            self.data['soil_moisture']
            .rolling(min_periods=25, window=25, center=True)
            .apply(rolling_var, raw=True)
        ).div(
            self.data['soil_moisture']
            .rolling(window=25, center=True)
            .apply(rolling_mean, raw=True),
            axis=0,
        )

        self.data['eq_new1'] = (
            self.data['soil_moisture']
            .rolling(min_periods=3, window=4, center=True)
            .apply(peak, raw=True)
            .shift(-1)
        )

        self.data['spike_2h'] = self.data['eq_new1'].shift(1) > 1

        self.data['spike'] = (
            (
                ((self.data['eq4'] > 1.15) | (self.data['eq4'] < 0.85))
                | (self.data['spike_2h'] > 0)
            )
            & ((self.data['eq5'] > 0.8) & (self.data['eq5'] < 1.2))
            & (self.data['eq6'] < 1)
            & (self.data['eq_new1'] > 0)
        )

        index = self.data[
            (self.data.spike > 0)
            | ((self.data.spike.shift(1) > 0) & (self.data['spike_2h'] > 0))
        ].index

        self.data['qflag'][index].apply(lambda x: x.add(tag))

[docs]    def flag_D07(self, tag):
        """
        !Includes jumps (D08)!
        Soil moisture drop (D07) or jump (D08):
        Flags when time-series shows a break based on relative (and absolute) change in
        soil moisture, a comparison of the first derivatives to the average of first derivatives centered at t and a
        large negative (positive) second derivative at t followed by a large postive (negative) value at t+1.
        The resective observations are then flagged as drop "D07" (or jump "D08") when the 1. derivative at t is
        negative (positive).

        See Eq [7,8,9] in Dorigo et al. (2013), Global Automated Quality Control of In Situ
        Soil Moisture Data from the International Soil Moisture Network,VZJ.

        Includes an alternative drop type, which was not included in VJZ paper: drop from above 0.05m³/m³ to zero

        Parameters
        ----------
        tag : string or int, optional
        code added to qflag-column when flag-criteria are met
        """

        self.data['absolute_change'] = self.data['soil_moisture'] - self.data[
            'soil_moisture'
        ].shift(1)
        self.data['eq7'] = abs(self.data['absolute_change'].div(self.data['soil_moisture']))
        self.data['eq8'] = abs(self.data['deriv1'].rolling(min_periods=4, window=25, center=True).mean() * 10)
        self.data['eq9'] = round(abs(self.data['deriv2'].shift(1).div(self.data['deriv2'])), 1)
        self.data['eq9a'] = abs(self.data['deriv2'].div(self.data['deriv2'].shift(-2)))

        # Include drops to zero!
        self.data['eq_new2'] = (abs(self.data['absolute_change']) > 5) & (
            self.data['soil_moisture'] == 0
        )

        index = self.data[
            (self.data['eq7'] > 0.1)
            & (abs(self.data['absolute_change']) > 1)
            & (self.data['soil_moisture'] != 0)
            & (abs(self.data['deriv1']) > self.data['eq8'])
            & ~np.isclose(abs(self.data['deriv1']), self.data['eq8'])
            & (np.isclose(self.data['eq9'], 1, atol=1e-2))
            & (self.data['deriv2'] != 0)
            & (self.data['eq9a'] > 10)
        ].index

        index_neg = index.intersection(self.data[self.data['deriv1'] < 0].index)
        index_pos = index.intersection(self.data[self.data['deriv1'] > 0].index)

        index_zero = self.data[
            self.data['eq_new2'] > 0
        ].index  # index where there are drops to zero
        index_neg = index_neg.append(index_zero)

        if len(index_neg):
            self.data['qflag'][index_neg].apply(lambda x: x.add(tag))

        # Change tag to indicate soil moisture jumps
        if isinstance(tag, int):
            tag += 1
        elif tag == 'D07':
            tag = 'D08'

        # Includes soil moisture jumps (flag D08)
        if len(index_pos):
            self.data['qflag'][index_pos].apply(lambda x: x.add(tag))

[docs]    def flag_D08(self):
        """
        !Included in flag_D07!
        """
        pass

[docs]    def flag_D09(self, tag):
        """
        Low constant values:
        Flags where a previous soil moisture break (D07) and a period of low relative variance
        (variance/mean < 0.001 m³m⁻³) coincide, soil moisture observations are flagged as "D09" as long as the
        relative variance remains below the treshold. The defined minimum duration of a low plateau is 13h.

        See Eq [14] in Dorigo et al. (2013), Global Automated Quality Control of In Situ
        Soil Moisture Data from the International Soil Moisture Network,VZJ.

        Parameters
        ----------
        tag : string or int, optional
        code added to qflag-column when flag-criteria are met
        """

        self.data.dropna(subset=['soil_moisture'], inplace=True)

        # calculate relative variance
        self.data['rel_var'] = round(
            self.data['soil_moisture']
            .rolling(min_periods=13, window=13)
            .var()
            .shift(-12),
            4,
        ) / round(
            self.data['soil_moisture']
            .rolling(min_periods=13, window=13)
            .mean()
            .shift(-12),
            4,
        )

        # When sm == 0 for >12h => the mean equals 0 => relative variance is therefore calculated as nan
        # To catch periods of sm=0 after a sm-drop to zero (D07 criteria 4): reset these nan-values to 0
        self.data['rel_var'][
            self.data['rel_var'].isna() & (self.data['soil_moisture'] == 0)
        ] = 0.0

        # find where there is a drop in soil moisture (flag D07) and a period of low relative variance
        flag_D07 = 'D07'
        if type(tag) == int:
            flag_D07 = 10
        self.data['event'] = (self.data['qflag'].astype(str)).str.contains(
            str(flag_D07)
        ) & (self.data['rel_var'] < 0.001)
        self.data['event'].replace(np.nan, 0, inplace=True)
        self.data['event'] = self.data['event'].astype(int)

        # assign -1 where the "event" could end and create a pleateau_mask
        self.data.loc[
            (self.data[['rel_var']].max(1).diff() >= 0.001) & (self.data['event'] == 0),
            'event',
        ] = -1

        def plateau_mask(array_sequence) -> list:
            """
            Generates a mask where Plateau criteria are fulfilled.

            Parameters
            ----------
            array_sequence:  numpy ndarray
                sequence of 1 (Plateau criteria fulfilled), -1(Plateau criteria no longer fulfilled), 0

            Returns
            -------
            list
                sequence of 1 (Plateau criteria fulfilled) and 0 (Plateau criteria not fulfilled)
            """
            return reduce(lambda x, y: x + [max(min(x[-1] + y, 1), 0)], array_sequence, [0])[1:]

        self.data['plateau'] = plateau_mask(self.data['event'].values)

        # Extend each Plateau to at least 13h time (minimum period)
        self.data['end'] = self.data['plateau'].rolling(min_periods=13, window=13).max()

        index = self.data[self.data['end'] > 0.0].index

        if len(index):
            self.data['qflag'][index].apply(lambda x: x.add(tag))

        if type(self.data.index) == pd.core.indexes.datetimes.DatetimeIndex:
            self.data = self.data.resample('H').asfreq()

[docs]    def flag_D10(self, tag):
        """
        Invariant high soil moisture values:
        Flags where the variance of soil moisture values within 12h is below 0.05 ->  period of low variance (plv)
        with a min_len of 12h. The plv requires a rise in the first derivative of at least 0.25 in beginning of the plv
        +/- 12h and a drop in the first derivative lower or equal to 0 at the end of the plv +/- 12h a mean of the
        soil moisture values between the rise and drop (or if they occur beyond plv scope, beginning and/or respective
        end of plv) of above 0.95% of the previous highest soil moisture value ever detected (highest_sm).

        See Eq [10,11,12,13] in Dorigo et al. (2013), Global Automated Quality Control of In Situ
        Soil Moisture Data from the International Soil Moisture Network,VZJ.

        Parameters
        ----------
        tag : string or int, optional
        code added to qflag-column when flag-criteria are met
        """

        def renumber_plateaus(array) -> list:
            """
            Possible plateaus are numbered consecutively.
            (e.g.: array([1,0,1,1,0,0,1,1)] -> [1,0,2,2,0,0,3,3])

            Parameters
            ----------
            array : ndarray
                1-dimensional array containing mask where variance of soil moisture observations are below 0.05 for 12h

            Returns
            -------
            seq : list
                Sequence containing rising group numbers (potential plateaus).
            """
            group = 1
            seq = []
            for a, b in zip(array, array[1:]):
                seq.append((lambda x: group if x > 0 else 0)(a))
                if a == 1 and b == 0:
                    group += 1
            return seq + [group * array[-1]]

        # Mean of plateau must be higher than 95% of this threshold;
        # For ISMN quality flags the previous 2 years of data are taken into account.
        highest_sm_value = self.data['soil_moisture'][
            self.data['soil_moisture'] < 60
        ].max()

        # Throw out datagaps - plateau can bridge gap
        self.data.dropna(subset=['soil_moisture'], inplace=True)

        # Look for periods of low variance (VAR) and assign rising numbers
        self.data.loc[:, 'VAR'] = (
            self.data['soil_moisture']
            .rolling(min_periods=12, window=12)
            .var()
            .shift(-11)
            <= 0.05
        )
        self.data['VAR_grouped'] = renumber_plateaus(self.data.VAR.values)

        # Look for maximum rise and minimum drop within 25 hours for each period of low varicance
        self.data.loc[:, 'maximum'] = (self.data['deriv1'].rolling(window=25, min_periods=1).max().shift(-12))
        self.data.loc[:, 'minimum'] = (self.data['deriv1'].rolling(window=25, min_periods=1).min().shift(-24))

        # extend pre-existing plateau in database, adds artificial starting point for plateau
        if 'd10_mask' in self.data.columns:
            self.data.loc[:, 'maximum'][self.data.d10_mask > 0] = 99

        rise = round(self.data.groupby('VAR_grouped')['maximum'].first(), 3)
        drop = round(self.data.groupby('VAR_grouped')['minimum'].last(), 3)
        rise = rise[rise >= 0.25]
        drop = drop[drop < 0]

        possible_plateaus = pd.concat([rise, drop], axis=1)
        possible_plateaus.dropna(inplace=True)

        index = []
        for idx, row in possible_plateaus.iterrows():
            # Look for possible plateaus including both a soil moisture rise and drop
            self.data['VAR_rise_drop'] = self.data.VAR_grouped[(self.data.VAR_grouped == idx)]
            VAR_period = (self.data['VAR_rise_drop'].rolling(window=12, min_periods=1).max() == idx)

            # max lies inside of VAR period
            if not self.data.index[VAR_period & (self.data['deriv1'] == row.maximum)].empty:
                max_search_period_start = self.data.index[VAR_period & (self.data['deriv1'] == row.maximum)][0]
                # min lies within VAR period
                if not self.data.index[VAR_period & (self.data['deriv1'] == row.minimum)].empty:
                    min_search_period_end = self.data.index[VAR_period & (self.data['deriv1'] == row.minimum)][0]
                # min lies outside VAR
                else:
                    min_search_period_end = VAR_period[::-1].idxmax()

            # max lies outside of VAR period
            else:
                max_search_period_start = VAR_period.idxmax()
                # mimimum within VAR period
                if not self.data.index[VAR_period & (self.data['deriv1'] == row.minimum)].empty:
                    min_search_period_end = self.data.index[VAR_period & (self.data['deriv1'] == row.minimum)][0]
                # minimum within VAR period
                else:
                    min_search_period_end = VAR_period[::-1].idxmax()

            plateau = self.data['soil_moisture'].loc[max_search_period_start:min_search_period_end]
            if 'highest_sm' in self.data.columns:
                if plateau.mean() > (
                    self.data['highest_sm']
                    .loc[max_search_period_start:min_search_period_end]
                    .mean()
                    * 0.95
                ):
                    index.extend(plateau.index)
            else:  # if no highest_sm column then use highest_sm_value as threshold
                if plateau.mean() > (highest_sm_value * 0.95):
                    index.extend(plateau.index)

        self.data['qflag'][index].apply(lambda x: x.add(tag))

[docs]    def flag_G(self, tag):
        """
        Applies tag for all unflagged observations

        Parameters
        ----------
        tag : string or int, optional
        code added to qflag-column when flag-criteria are met
        """
        index_good = self.data[self.data['qflag'] == set()].index
        self.data['qflag'][index_good].apply(lambda x: x.add(tag))