mirror of https://github.com/CGAL/cgal
377 lines
13 KiB
C++
377 lines
13 KiB
C++
// Copyright (c) 2014 Stefan Walk
|
|
//
|
|
// This file is part of CGAL (www.cgal.org).
|
|
//
|
|
// $URL$
|
|
// $Id$
|
|
// SPDX-License-Identifier: LicenseRef-RFL
|
|
// License notice in Installation/LICENSE.RFL
|
|
//
|
|
// Author(s) : Stefan Walk
|
|
|
|
// Modifications from original library:
|
|
// * changed inclusion protection tag
|
|
// * moved to namespace CGAL::internal::
|
|
// * init_feature_class_data() does not resize anymore (it's done
|
|
// later directly in the splitter). WARNING: all splitters other
|
|
// than the default won't be working correctly (but experimentally
|
|
// they are less good and we don't use them - we keep them just in
|
|
// case)
|
|
// * sample reduction is now 36.8% (to account for the correction of
|
|
// the randomization of the input which used to implicitly ignore
|
|
// this proportion of items)
|
|
// * map_points() in axis aligned splitter now only uses a subset of
|
|
// the points for evaluation (for timing optimization=
|
|
|
|
#ifndef CGAL_INTERNAL_LIBLEARNING_RANDOMFOREST_COMMON_LIBRARIES_H
|
|
#define CGAL_INTERNAL_LIBLEARNING_RANDOMFOREST_COMMON_LIBRARIES_H
|
|
#include <algorithm>
|
|
#include <numeric>
|
|
#include <limits>
|
|
#include <list>
|
|
#include <CGAL/IO/binary_file_io.h>
|
|
#include <boost/version.hpp>
|
|
#include <boost/bind/bind.hpp>
|
|
#include <boost/ptr_container/ptr_vector.hpp>
|
|
#include <boost/random/mersenne_twister.hpp>
|
|
#include <boost/random/uniform_int_distribution.hpp>
|
|
#include <boost/random/uniform_01.hpp>
|
|
#include <boost/random/normal_distribution.hpp>
|
|
#if defined(CGAL_LINKED_WITH_BOOST_IOSTREAMS) && defined(CGAL_LINKED_WITH_BOOST_SERIALIZATION)
|
|
#include <boost/serialization/vector.hpp>
|
|
#endif
|
|
#include <boost/scoped_ptr.hpp>
|
|
#include <memory>
|
|
#include <boost/make_shared.hpp>
|
|
#include <unordered_set>
|
|
#include <iostream>
|
|
#include <cstdio>
|
|
|
|
#include "../dataview.h"
|
|
|
|
namespace CGAL { namespace internal {
|
|
|
|
namespace liblearning {
|
|
namespace RandomForest {
|
|
|
|
typedef std::vector< std::pair<float, int> > FeatureClassDataFloat;
|
|
inline void init_feature_class_data(FeatureClassDataFloat& /*data*/, int /*n_classes*/, int /* n_samples */)
|
|
{
|
|
// data.resize(n_samples);
|
|
}
|
|
typedef std::unordered_set<int> FeatureSet;
|
|
|
|
typedef boost::random::uniform_int_distribution<> UniformIntDist;
|
|
typedef boost::random::normal_distribution<> NormalDist;
|
|
typedef boost::random::mt19937 RandomGen;
|
|
typedef boost::random::uniform_01<> UnitDist;
|
|
|
|
struct ForestParams {
|
|
size_t n_classes;
|
|
size_t n_features;
|
|
size_t n_samples;
|
|
size_t n_in_bag_samples;
|
|
size_t max_depth;
|
|
size_t n_trees;
|
|
size_t min_samples_per_node;
|
|
float sample_reduction;
|
|
ForestParams() :
|
|
n_classes(0),
|
|
n_features(0),
|
|
n_samples(0),
|
|
n_in_bag_samples(0),
|
|
max_depth(42),
|
|
n_trees(100),
|
|
min_samples_per_node(5),
|
|
sample_reduction(0.368f)
|
|
{}
|
|
#if defined(CGAL_LINKED_WITH_BOOST_IOSTREAMS) && defined(CGAL_LINKED_WITH_BOOST_SERIALIZATION)
|
|
template <typename Archive>
|
|
void serialize(Archive& ar, unsigned /*version*/)
|
|
{
|
|
ar & BOOST_SERIALIZATION_NVP(n_classes);
|
|
ar & BOOST_SERIALIZATION_NVP(n_features);
|
|
ar & BOOST_SERIALIZATION_NVP(n_samples);
|
|
ar & BOOST_SERIALIZATION_NVP(n_in_bag_samples);
|
|
ar & BOOST_SERIALIZATION_NVP(max_depth);
|
|
ar & BOOST_SERIALIZATION_NVP(n_trees);
|
|
ar & BOOST_SERIALIZATION_NVP(min_samples_per_node);
|
|
ar & BOOST_SERIALIZATION_NVP(sample_reduction);
|
|
}
|
|
#endif
|
|
|
|
void write (std::ostream& os)
|
|
{
|
|
I_Binary_write_size_t_into_uinteger32 (os, n_classes);
|
|
I_Binary_write_size_t_into_uinteger32 (os, n_features);
|
|
I_Binary_write_size_t_into_uinteger32 (os, n_samples);
|
|
I_Binary_write_size_t_into_uinteger32 (os, n_in_bag_samples);
|
|
I_Binary_write_size_t_into_uinteger32 (os, max_depth);
|
|
I_Binary_write_size_t_into_uinteger32 (os, n_trees);
|
|
I_Binary_write_size_t_into_uinteger32 (os, min_samples_per_node);
|
|
I_Binary_write_float32 (os, sample_reduction);
|
|
}
|
|
|
|
void read (std::istream& is)
|
|
{
|
|
I_Binary_read_size_t_from_uinteger32 (is, n_classes);
|
|
I_Binary_read_size_t_from_uinteger32 (is, n_features);
|
|
I_Binary_read_size_t_from_uinteger32 (is, n_samples);
|
|
I_Binary_read_size_t_from_uinteger32 (is, n_in_bag_samples);
|
|
I_Binary_read_size_t_from_uinteger32 (is, max_depth);
|
|
I_Binary_read_size_t_from_uinteger32 (is, n_trees);
|
|
I_Binary_read_size_t_from_uinteger32 (is, min_samples_per_node);
|
|
I_Binary_read_float32 (is, sample_reduction);
|
|
}
|
|
};
|
|
|
|
struct QuadraticSplitter {
|
|
typedef float FeatureType;
|
|
typedef FeatureClassDataFloat FeatureClassData;
|
|
int n_features;
|
|
std::vector<FeatureType> w;
|
|
FeatureType threshold;
|
|
QuadraticSplitter() : n_features(0) {}
|
|
QuadraticSplitter(int n_features, std::vector<FeatureType> const& w) :
|
|
n_features(n_features), w(w)
|
|
{}
|
|
void set_threshold(FeatureType new_threshold) {
|
|
threshold = new_threshold;
|
|
}
|
|
FeatureType map_sample(FeatureType const* v) const {
|
|
double result = 0.0;
|
|
int i_feature = 0;
|
|
for (; i_feature < n_features; ++i_feature) {
|
|
result += w[i_feature] * v[i_feature];
|
|
}
|
|
for (int i_1 = 0; i_1 < n_features; ++i_1) {
|
|
for (int i_2 = 0; i_2 < n_features; ++i_2) {
|
|
result += w[i_feature++] * v[i_1] * v[i_2];
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
bool classify_sample(FeatureType const* v) const {
|
|
return map_sample(v) > threshold;
|
|
}
|
|
void map_points(DataView2D<FeatureType> samples,
|
|
DataView2D<int> labels,
|
|
int const* sample_idxes,
|
|
int n_samples,
|
|
FeatureClassData& data_points) const
|
|
{
|
|
for (int i_sample = 0; i_sample < n_samples; ++i_sample) {
|
|
int sample_idx = sample_idxes[i_sample];
|
|
int sample_class = labels(sample_idx, 0);
|
|
FeatureType sample_fval = map_sample(samples.row_pointer(sample_idx));
|
|
data_points[i_sample] = std::make_pair(sample_fval, sample_class);
|
|
}
|
|
}
|
|
#if defined(CGAL_LINKED_WITH_BOOST_IOSTREAMS) && defined(CGAL_LINKED_WITH_BOOST_SERIALIZATION)
|
|
template <typename Archive>
|
|
void serialize(Archive& ar, unsigned /*version*/)
|
|
{
|
|
ar & BOOST_SERIALIZATION_NVP(n_features);
|
|
ar & BOOST_SERIALIZATION_NVP(w);
|
|
ar & BOOST_SERIALIZATION_NVP(threshold);
|
|
}
|
|
#endif
|
|
};
|
|
|
|
struct LinearSplitter {
|
|
typedef float FeatureType;
|
|
typedef FeatureClassDataFloat FeatureClassData;
|
|
std::vector<FeatureType> w;
|
|
FeatureType threshold;
|
|
LinearSplitter() {}
|
|
LinearSplitter(std::vector<FeatureType> const& w) :
|
|
w(w)
|
|
{}
|
|
void set_threshold(FeatureType new_threshold) {
|
|
threshold = new_threshold;
|
|
}
|
|
bool classify_sample(FeatureType const* v) const {
|
|
return std::inner_product(w.begin(), w.end(), v, 0.0f) > threshold;
|
|
}
|
|
void map_points(DataView2D<FeatureType> samples,
|
|
DataView2D<int> labels,
|
|
int const* sample_idxes,
|
|
int n_samples,
|
|
FeatureClassData& data_points) const
|
|
{
|
|
for (int i_sample = 0; i_sample < n_samples; ++i_sample) {
|
|
int sample_idx = sample_idxes[i_sample];
|
|
int sample_class = labels(sample_idx, 0);
|
|
FeatureType sample_fval = std::inner_product(w.begin(), w.end(),
|
|
samples.row_pointer(sample_idx), 0.0f);
|
|
data_points[i_sample] = std::make_pair(sample_fval, sample_class);
|
|
}
|
|
}
|
|
#if defined(CGAL_LINKED_WITH_BOOST_IOSTREAMS) && defined(CGAL_LINKED_WITH_BOOST_SERIALIZATION)
|
|
template <typename Archive>
|
|
void serialize(Archive& ar, unsigned /*version*/)
|
|
{
|
|
ar & BOOST_SERIALIZATION_NVP(w);
|
|
ar & BOOST_SERIALIZATION_NVP(threshold);
|
|
}
|
|
#endif
|
|
};
|
|
|
|
struct AxisAlignedSplitter {
|
|
typedef float FeatureType;
|
|
typedef FeatureClassDataFloat FeatureClassData;
|
|
int feature;
|
|
FeatureType threshold;
|
|
AxisAlignedSplitter() : feature(-1) {}
|
|
AxisAlignedSplitter(int feature) :
|
|
feature(feature)
|
|
{}
|
|
void set_threshold(FeatureType new_threshold) {
|
|
threshold = new_threshold;
|
|
}
|
|
bool classify_sample(FeatureType const* v) const {
|
|
return v[feature] > threshold;
|
|
}
|
|
void map_points(DataView2D<FeatureType> samples,
|
|
DataView2D<int> labels,
|
|
int const* sample_idxes,
|
|
int n_samples,
|
|
FeatureClassData& data_points) const
|
|
{
|
|
std::size_t size = (std::min)(std::size_t(5000), std::size_t(n_samples));
|
|
data_points.clear();
|
|
data_points.reserve(size);
|
|
|
|
std::size_t step = n_samples / size;
|
|
|
|
for (int i_sample = 0; i_sample < n_samples; i_sample += step) {
|
|
// determine index of this sample ...
|
|
int sample_idx = sample_idxes[i_sample];
|
|
// determine class ...
|
|
int sample_class = labels(sample_idx, 0);
|
|
// determine value of the selected feature for this sample
|
|
FeatureType sample_fval = samples(sample_idx, feature);
|
|
data_points.push_back(std::make_pair(sample_fval, sample_class));
|
|
}
|
|
}
|
|
#if defined(CGAL_LINKED_WITH_BOOST_IOSTREAMS) && defined(CGAL_LINKED_WITH_BOOST_SERIALIZATION)
|
|
template <typename Archive>
|
|
void serialize(Archive& ar, unsigned /*version*/)
|
|
{
|
|
ar & BOOST_SERIALIZATION_NVP(feature);
|
|
ar & BOOST_SERIALIZATION_NVP(threshold);
|
|
}
|
|
#endif
|
|
|
|
void write (std::ostream& os)
|
|
{
|
|
os.write((char*)(&feature), sizeof(int));
|
|
os.write((char*)(&threshold), sizeof(FeatureType));
|
|
}
|
|
|
|
void read (std::istream& is)
|
|
{
|
|
is.read((char*)(&feature), sizeof(int));
|
|
is.read((char*)(&threshold), sizeof(FeatureType));
|
|
}
|
|
};
|
|
|
|
struct AxisAlignedRandomSplitGenerator {
|
|
typedef float FeatureType;
|
|
FeatureSet features;
|
|
FeatureSet::const_iterator it;
|
|
|
|
void init(DataView2D<FeatureType> samples,
|
|
DataView2D<int> /*labels*/,
|
|
int* /*sample_idxes*/,
|
|
int /*n_samples*/,
|
|
size_t /*n_classes*/,
|
|
RandomGen& gen)
|
|
{
|
|
features.clear();
|
|
int n_features = samples.cols;
|
|
size_t n_used_features = std::sqrt(n_features);
|
|
UniformIntDist dist(0, n_features - 1);
|
|
// insert into set until required number of unique features is found
|
|
while (features.size() < n_used_features) {
|
|
features.insert(dist(gen));
|
|
}
|
|
it = features.begin();
|
|
}
|
|
AxisAlignedSplitter gen_proposal(RandomGen& /*gen*/)
|
|
{
|
|
if (it == features.end()) {
|
|
it = features.begin();
|
|
}
|
|
return AxisAlignedSplitter(*it++);
|
|
}
|
|
size_t num_proposals() const {
|
|
return features.size();
|
|
}
|
|
};
|
|
|
|
struct LinearSplitGenerator {
|
|
typedef float FeatureType;
|
|
size_t n_features;
|
|
size_t n_proposals;
|
|
LinearSplitGenerator(size_t n_proposals = 5) :
|
|
n_proposals(n_proposals)
|
|
{}
|
|
void init(DataView2D<FeatureType> samples,
|
|
DataView2D<int> /*labels*/,
|
|
int* /*sample_idxes*/,
|
|
int /*n_samples*/,
|
|
size_t /*n_classes*/,
|
|
RandomGen& /*gen*/)
|
|
{
|
|
n_features = samples.cols;
|
|
}
|
|
size_t num_proposals() const {
|
|
return n_proposals;
|
|
}
|
|
LinearSplitter gen_proposal(RandomGen& gen) {
|
|
NormalDist dist;
|
|
std::vector<FeatureType> weights(n_features);
|
|
for (size_t i_feature = 0; i_feature < n_features; ++i_feature) {
|
|
weights[i_feature] = dist(gen);
|
|
}
|
|
return LinearSplitter(weights);
|
|
}
|
|
};
|
|
|
|
struct QuadraticSplitGenerator {
|
|
typedef float FeatureType;
|
|
size_t n_features;
|
|
size_t n_proposals;
|
|
QuadraticSplitGenerator(size_t n_proposals = 5) :
|
|
n_proposals(n_proposals)
|
|
{}
|
|
void init(DataView2D<FeatureType> samples,
|
|
DataView2D<int> /*labels*/,
|
|
int* /*sample_idxes*/,
|
|
int /*n_samples*/,
|
|
size_t /*n_classes*/,
|
|
RandomGen& /*gen*/)
|
|
{
|
|
n_features = samples.cols;
|
|
}
|
|
size_t num_proposals() const {
|
|
return n_proposals;
|
|
}
|
|
QuadraticSplitter gen_proposal(RandomGen& gen) {
|
|
NormalDist dist;
|
|
std::vector<FeatureType> weights(n_features + n_features*n_features);
|
|
for (size_t i_feature = 0; i_feature < weights.size(); ++i_feature) {
|
|
weights[i_feature] = dist(gen);
|
|
}
|
|
return QuadraticSplitter(n_features, weights);
|
|
}
|
|
};
|
|
|
|
}
|
|
}
|
|
|
|
}} // namespace CGAL::internal::
|
|
|
|
#endif
|