#forest #isolation #detection #algorithm #tree #julia #python

isolation_forest

隔离森林异常检测算法的实现

1 个稳定版本

1.1.0 2021年5月5日

#1734 in 算法

MIT 许可证

24KB
393

C++ Rust Python 2.7|3.7 MIT license

LibIsolationForest

描述

本项目包含隔离森林算法的 Rust、C++、Julia 和 Python 实现。隔离森林是一种基于随机生成的决策树集合的异常检测算法。关于该算法的完整描述,请参阅算法提出者的原始论文:

https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf

Python 示例

可以使用 pip 安装 Python 实现:

pip install IsolationForest

下面的代码片段展示了如何使用该库的 Python 版本。您也可以阅读 test.py 文件以获取完整示例。随着库的成熟,我将在该文件中添加更多测试示例。

from isolationforest import IsolationForest

# Configure the forest with the number of trees and the sub-sampling size.
forest = IsolationForest.Forest(num_trees, sub_sampling_size)

# Describe one training sample; each feature is a one-entry {name: value} dict.
training_sample = IsolationForest.Sample("Training Sample 1")
training_features = [
    {"feature 1": feature_1_value},
    # Add more features to the sample...
    {"feature N": feature_N_value},
]
# Add the features to the sample.
training_sample.add_features(training_features)
forest.add_sample(training_sample)
# Add more samples to the forest...

# Create the forest.
forest.create()

# Describe the sample to be scored, using the same feature layout.
test_sample = IsolationForest.Sample("Test Sample 1")
test_features = [
    {"feature 1": feature_1_value},
    # Add more features to the sample...
    {"feature N": feature_N_value},
]
# Add the features to the sample.
test_sample.add_features(test_features)

# Score the sample.
score = forest.score(test_sample)
normalized_score = forest.normalized_score(test_sample)

Rust 示例

可以在 lib.rs 中找到更多使用该库 Rust 版本的示例。随着库的成熟,我将在该文件中添加更多测试示例。

// Example: train a forest on part of the "Iris-setosa" records and score
// every remaining record against it.
//
// NOTE(review): `Uniform` and `.sample()` are presumably provided by
// `rand::distributions` (`Uniform` plus the `Distribution` trait); they must
// be in scope for this snippet to compile — confirm against the rand version
// in use.

// Open the data set; nothing below can run without it, so abort on failure.
let file_path = "../data/iris.data.txt";
let file = match std::fs::File::open(&file_path) {
    Err(why) => panic!("Couldn't open {} {}", file_path, why),
    Ok(file) => file,
};

let mut reader = csv::Reader::from_reader(file);
// NOTE(review): the two arguments are presumably the number of trees and the
// sub-sampling size, as in the Python constructor — confirm against
// `Forest::new`.
let mut forest = crate::isolation_forest::Forest::new(10, 10);
let training_class_name = "Iris-setosa";
let mut test_samples = Vec::new();
let mut avg_control_set_score = 0.0;
let mut avg_outlier_set_score = 0.0;
let mut avg_control_set_normalized_score = 0.0;
let mut avg_outlier_set_normalized_score = 0.0;
let mut num_control_tests = 0;
let mut num_outlier_tests = 0;
let mut rng = rand::thread_rng();
let range = Uniform::from(0..10);

for record in reader.records() {
    let record = record.unwrap();

    // Columns 0-3 hold the four measurements, column 4 the class name.
    let sepal_length_cm: f64 = record[0].parse().unwrap();
    let sepal_width_cm: f64 = record[1].parse().unwrap();
    let petal_length_cm: f64 = record[2].parse().unwrap();
    let petal_width_cm: f64 = record[3].parse().unwrap();
    let name: String = record[4].parse().unwrap();

    // Feature values are passed as u64, so each measurement is scaled by 10
    // and truncated by the cast.
    let mut features = crate::isolation_forest::FeatureList::new();
    features.push(crate::isolation_forest::Feature::new("sepal length in cm", (sepal_length_cm * 10.0) as u64));
    features.push(crate::isolation_forest::Feature::new("sepal width in cm", (sepal_width_cm * 10.0) as u64));
    features.push(crate::isolation_forest::Feature::new("petal length in cm", (petal_length_cm * 10.0) as u64));
    features.push(crate::isolation_forest::Feature::new("petal width in cm", (petal_width_cm * 10.0) as u64));

    let mut sample = crate::isolation_forest::Sample::new(&name);
    sample.add_features(&mut features);

    // Randomly split the samples into training and test samples: `x` is drawn
    // from 0..10, so `x > 5` sends part of the training-class records to the
    // forest; every other record (including all other classes) is kept for
    // testing.
    let x = range.sample(&mut rng) as u64;
    if x > 5 && name == training_class_name {
        // The forest takes ownership of the sample; no separate copy is kept.
        forest.add_sample(sample);
    }
    else {
        test_samples.push(sample);
    }
}

// Create the forest.
forest.create();

// Use each test sample: records of the training class form the control set,
// all other records form the outlier set.
for test_sample in test_samples {
    let score = forest.score(&test_sample);
    let normalized_score = forest.normalized_score(&test_sample);

    if training_class_name == test_sample.name {
        avg_control_set_score += score;
        avg_control_set_normalized_score += normalized_score;
        num_control_tests += 1;
    }
    else {
        avg_outlier_set_score += score;
        avg_outlier_set_normalized_score += normalized_score;
        num_outlier_tests += 1;
    }
}

// Compute statistics: turn the accumulated sums into averages, skipping any
// set that received no samples to avoid dividing by zero.
if num_control_tests > 0 {
    avg_control_set_score /= num_control_tests as f64;
    avg_control_set_normalized_score /= num_control_tests as f64;
}
if num_outlier_tests > 0 {
    avg_outlier_set_score /= num_outlier_tests as f64;
    avg_outlier_set_normalized_score /= num_outlier_tests as f64;
}

println!("Avg Control Score: {}", avg_control_set_score);
println!("Avg Control Normalized Score: {}", avg_control_set_normalized_score);
println!("Avg Outlier Score: {}", avg_outlier_set_score);
println!("Avg Outlier Normalized Score: {}", avg_outlier_set_normalized_score);

C++ 示例

可以在 main.cpp 中找到使用该库 C++ 版本的示例。随着库的成熟,我将在该文件中添加更多测试示例。

Julia 示例

可以在 test.jl 中找到使用该库 Julia 版本的示例。随着库的成熟,我将在该文件中添加更多测试示例。

版本历史

1.0

  • 初始版本。

1.1

  • 添加了标准化分数。
  • 更新了 Rust 版本中的随机数生成代码,因为相关接口再次发生了变化。

许可证

本库采用 MIT 许可证发布,详细信息请参阅 LICENSE 文件。

依赖关系

~2–3MB
~49K SLoC