Get started

Inside your code, you just have to add the spot.h header file.

#include "spot.h"

One paramount point is that libspot does not know how to allocate/free memory (it does not know libc by design). So you have to provide these functions. By default you can pass the common malloc and free functions from stdlib.h.

#include "spot.h"
#include <stdlib.h>

set_allocators(malloc, free);

Ok, now you want to use the SPOT algorithm. You can allocate a Spot object either on stack or on heap.

// stack allocation
struct Spot spot;
// or heap allocation: size taken from the pointer itself, no cast needed in C
// (check spot_ptr against NULL before using it)
struct Spot *spot_ptr = malloc(sizeof *spot_ptr);

Then you must init that structure with the spot_init function.

// here we assume stack allocation
struct Spot spot;
// init with SPOT parameters
int status = spot_init(
        &spot, // pointer to the allocated structure
        1e-4,  // q: anomaly probability
        0,     // low: observe upper tail
        1,     // discard_anomalies: flag anomalies
        0.998, // level: tail quantile (the highest 0.2% of values shape the tail)
        200    // max_excess: number of data to keep to summarize the tail
    );
// you can check the initialization: a negative status signals an error
if (status < 0) {
    // print error: the negated status is passed to error_msg, which fills
    // buffer with the text printed below
    char buffer[100];
    error_msg(-status, buffer, 100);
    printf("ERROR %d: %s\n", -status, buffer);
}

Basically q is the anomaly probability. The algorithm will flag events that have a lower probability than q. In practice, it must be very low (like 1e-3 or less).

The low parameter just defines whether we flag high (low = 0) or low (low = 1) values, while discard_anomalies says that we want to reject anomalies.

The level should be a high quantile (a value close to 1). It is used to delimit the tail of the distribution. One may use values like 0.98, 0.99 or 0.995.

Finally max_excess is the number of data that will be kept to model the tail of the distribution.

You can read more about the parameters in the dedicated section.

Before prediction, we commonly need to fit the algorithm with initial data. In practice you must provide a buffer of double (pointer + size of the buffer). How many records are needed? Briefly, a few thousand (like 2000), but it depends on the parameters passed to SPOT (and also on whether you have enough data).

// initial_data points to the first value of the buffer, size is its length:
// double* initial_data = ...
// unsigned long size = ...
status = spot_fit(&spot, initial_data, size);
// as with spot_init, a negative status signals an error
if (status < 0) {
    // print error: the negated status is passed to error_msg, which fills
    // buffer with the text printed below
    char buffer[100];
    error_msg(-status, buffer, 100);
    printf("ERROR %d: %s\n", -status, buffer);
}

Full example

Here we present a basic example where the SPOT algorithm is run on a Gaussian (standard normal) stream.

// basic.c
// BUILD:
// $ make
// $ cc -o /tmp/basic examples/basic.c dist/libspot.a* -Idist/ -lm
// RUN:
// $ /tmp/basic [--stdlib]

#include "spot.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

// number of samples generated to fit the model before streaming
const unsigned long TRAIN_SIZE = 20000UL;
// number of samples streamed through the detector once it is fitted
const unsigned long TEST_SIZE = 500000UL;

// N(0, 1) - Box-Muller transform
double gaussian_random() {
    double u = 1.0 - ((double)rand() / (double)RAND_MAX);
    double v = (double)rand() / (double)RAND_MAX;
    return sqrt(-2.0 * log(u)) * cos(2.0 * M_PI * v);
}

// Example driver: fits SPOT on TRAIN_SIZE samples drawn from
// gaussian_random(), then feeds TEST_SIZE further samples to spot_step and
// reports the elapsed time, the throughput and how many values fell in each
// category.
//
// Passing --stdlib as first argument hands log, exp and pow from <math.h>
// to libspot through set_math_functions.
//
// Returns 0 on success, the negated libspot status when spot_init or
// spot_fit fails, and EXIT_FAILURE when the sample buffers cannot be
// allocated.
//
// NOTE(review): the Spot structure is never released here; if spot.h
// provides a teardown function for struct Spot, it should be called once
// spot_init has succeeded - confirm against the header.
int main(int argc, const char *argv[]) {
    if (argc > 1 && strcmp(argv[1], "--stdlib") == 0) {
        set_math_functions(log, exp, pow);
    }

    // set random seed (time_t is narrowed explicitly to what srand expects)
    srand((unsigned int)time(NULL));
    // provide allocators to libspot
    set_allocators(malloc, free);

    // The sample buffers live on the heap: TRAIN_SIZE and TEST_SIZE are const
    // objects, not constant expressions, so arrays sized with them would be
    // variable-length arrays, and TEST_SIZE doubles (about 4 MB with 8-byte
    // doubles) can exceed the available stack.
    int exit_code = 0;
    double *train = malloc(TRAIN_SIZE * sizeof *train);
    double *test = malloc(TEST_SIZE * sizeof *test);
    if (train == NULL || test == NULL) {
        fprintf(stderr, "ERROR: cannot allocate the sample buffers\n");
        exit_code = EXIT_FAILURE;
        goto cleanup;
    }

    // stack allocation
    struct Spot spot;
    // init the structure with some parameters
    int status = spot_init(
        &spot,
        1e-4, // q: anomaly probability
        0,    // low: observe upper tail
        1,    // discard_anomalies: flag anomalies
        0.99, // level: tail quantile (the 1% higher values shapes the tail)
        200   // max_excess: number of data to keep to summarize the tail
    );
    if (status < 0) {
        exit_code = -status;
        goto cleanup;
    }

    // initial data
    for (unsigned long i = 0; i < TRAIN_SIZE; i++) {
        train[i] = gaussian_random();
    }

    // fit
    status = spot_fit(&spot, train, TRAIN_SIZE);
    if (status < 0) {
        exit_code = -status;
        goto cleanup;
    }

    // run: the stream is generated up front so that only spot_step is timed
    for (unsigned long i = 0; i < TEST_SIZE; i++) {
        test[i] = gaussian_random();
    }
    int normal = 0;
    int excess = 0;
    int anomaly = 0;

    clock_t start = clock();
    for (unsigned long k = 0; k < TEST_SIZE; k++) {
        switch (spot_step(&spot, test[k])) {
        case ANOMALY:
            anomaly++;
            break;
        case EXCESS:
            excess++;
            break;
        case NORMAL:
            normal++;
            break;
        default:
            // any other return value is not counted
            break;
        }
    }
    clock_t end = clock();

    double elapsed = (double)(end - start) / (double)CLOCKS_PER_SEC;

    printf("Time: %.2f ms\n", 1000.0 * elapsed);
    printf("Throughput: %.2f value/s\n", (double)TEST_SIZE / elapsed);
    printf("ANOMALY: %d, EXCESS: %d, NORMAL: %d\n", anomaly, excess, normal);

cleanup:
    // free(NULL) is a no-op, so this is safe after a partial allocation
    free(test);
    free(train);
    return exit_code;
}