# Source code for anomaly.adtk
import pandas as pd
import matplotlib.pyplot as plt
from adtk.detector import InterQuartileRangeAD
# [docs]
class AnomalyDetector:
    """
    Detect anomalies in a time series with ADTK and plot the result.

    By default the detection is done with ADTK's ``InterQuartileRangeAD``.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing the time series data. It is copied,
        so the caller's DataFrame is never modified.
    date_col : str, default "ORDERDATE"
        The column name containing date/time information. This column is set
        as the index of the internal copy, unless the index already carries
        that name.
    value_col : str, default "PROFIT_MARGIN"
        The column on which anomaly detection is performed.

    Attributes
    ----------
    df : pd.DataFrame
        The internal, date-indexed copy of the input.
    anomalies : pd.DataFrame or None
        Result of the last ``detect()`` call; ``None`` until it has run.
    """

    def __init__(self, df, date_col="ORDERDATE", value_col="PROFIT_MARGIN"):
        # Work on a copy so as not to modify the original DataFrame.
        self.df = df.copy()
        self.date_col = date_col
        self.value_col = value_col
        # Index by the date column unless the frame is already indexed by it.
        if self.df.index.name != self.date_col:
            self.df = self.df.set_index(self.date_col)
        # Filled in by detect(); None means detection has not been run yet.
        self.anomalies = None

    def detect(self, detector=None):
        """
        Detect anomalies in the configured value column.

        Parameters
        ----------
        detector : object, optional
            Any object exposing ``fit_detect(frame)`` in the style of an ADTK
            detector. When omitted, a default ``InterQuartileRangeAD()`` is
            used.

        Returns
        -------
        anomalies : pd.DataFrame
            Indexed like the input data, with two columns:
            - 'is_anamoly': the flag produced by the detector for each row.
              (The misspelled name is kept because callers may rely on it.)
            - 'value': the original value from the input DataFrame.

        Raises
        ------
        KeyError
            If ``value_col`` is not a column of the data.
        """
        if detector is None:
            detector = InterQuartileRangeAD()

        # Run the detection on a one-column frame holding only the values.
        values = self.df[self.value_col]
        detected = detector.fit_detect(self.df[[self.value_col]])

        # A one-column frame normally comes back as a one-column frame; also
        # accept a plain Series so alternative detectors can be plugged in.
        if isinstance(detected, pd.DataFrame):
            flags = detected[self.value_col]
        else:
            flags = detected

        # Name the flag column *before* attaching the raw values. Doing it
        # the other way round (add "value", then rename value_col) overwrites
        # the flags whenever value_col itself is called "value".
        anomalies = flags.to_frame(name="is_anamoly")
        anomalies["value"] = values

        self.anomalies = anomalies
        return anomalies

    def _anomaly_mask(self):
        """
        Return a strict boolean mask selecting the rows flagged as anomalies.

        Raises
        ------
        ValueError
            If ``detect()`` has not been called yet.
        """
        if self.anomalies is None:
            raise ValueError(
                "Anomalies have not been detected yet. Please call detect() first."
            )
        # Compare with True rather than indexing with the raw column: a flag
        # column containing NaN (which a detector may emit, e.g. for missing
        # inputs) cannot be used as a pandas mask directly. Rows that are
        # not explicitly flagged are treated as "not an anomaly".
        return self.anomalies["is_anamoly"].eq(True)

    def get_anomaly_dates(self):
        """
        Retrieve the dates where anomalies were detected.

        Returns
        -------
        anomaly_dates : pd.Series
            The index values (dates) of the flagged rows, as a Series with a
            default integer index and named after the data's index.

        Raises
        ------
        ValueError
            If ``detect()`` has not been called yet.
        """
        anomaly_dates = self.anomalies_index_for(self._anomaly_mask())
        return pd.Series(anomaly_dates, name=anomaly_dates.name)

    def anomalies_index_for(self, mask):
        """
        Return the index labels of ``self.anomalies`` selected by ``mask``.

        Parameters
        ----------
        mask : pd.Series
            Boolean mask aligned with ``self.anomalies``.
        """
        return self.anomalies[mask].index

    def visualize(
        self,
        figsize=(12, 6),
        title="Daily Profit Margin Over Time with Anomalies Highlighted",
        xlabel="Date",
        ylabel="Profit Margin Value",
        ylim=(40, 60),
    ):
        """
        Visualize the time series data with anomalies highlighted.

        The full series is drawn as a black line and every flagged point is
        overlaid as a red marker; the figure is then shown with ``plt.show()``.

        Parameters
        ----------
        figsize : tuple, default (12, 6)
            Size of the figure.
        title : str, default "Daily Profit Margin Over Time with Anomalies Highlighted"
            Title for the plot.
        xlabel : str, default "Date"
            Label for the x-axis.
        ylabel : str, default "Profit Margin Value"
            Label for the y-axis.
        ylim : tuple, optional, default (40, 60)
            Y-axis limits. Points outside this range are not visible; pass
            ``None`` to let matplotlib choose the limits from the data.

        Raises
        ------
        ValueError
            If ``detect()`` has not been called yet.
        """
        # Validate state before any figure is created.
        mask = self._anomaly_mask()

        plt.figure(figsize=figsize)
        # Plot the original time series.
        plt.plot(
            self.anomalies.index,
            self.anomalies["value"],
            label="Profit Margin",
            color="black",
        )
        # Highlight the anomaly points.
        anomaly_points = self.anomalies[mask]
        plt.scatter(
            anomaly_points.index,
            anomaly_points["value"],
            color="red",
            label="Anomaly Detected",
            s=100,
        )
        # Set plot labels and grid.
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.legend()
        plt.grid(True, linestyle="--", linewidth=0.5, alpha=0.4)
        if ylim is not None:
            plt.ylim(ylim)
        plt.show()