cgal/Classification/include/CGAL/Classification/ETHZ/internal/random-forest/common-libraries.hpp

377 lines
13 KiB
C++

// Copyright (c) 2014 Stefan Walk
//
// This file is part of CGAL (www.cgal.org).
//
// $URL$
// $Id$
// SPDX-License-Identifier: LicenseRef-RFL
// License notice in Installation/LICENSE.RFL
//
// Author(s) : Stefan Walk
// Modifications from original library:
// * changed inclusion protection tag
// * moved to namespace CGAL::internal::
// * init_feature_class_data() no longer resizes the buffer (this is
// now done directly in the splitter). WARNING: all splitters other
// than the default will not work correctly (experimentally they
// perform worse and are not used; they are kept just in case)
// * sample reduction is now 36.8% (to account for the correction of
// the randomization of the input which used to implicitly ignore
// this proportion of items)
// * map_points() in axis aligned splitter now only uses a subset of
// the points for evaluation (for timing optimization)
#ifndef CGAL_INTERNAL_LIBLEARNING_RANDOMFOREST_COMMON_LIBRARIES_H
#define CGAL_INTERNAL_LIBLEARNING_RANDOMFOREST_COMMON_LIBRARIES_H
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <limits>
#include <list>
#include <memory>
#include <numeric>
#include <unordered_set>
#include <utility>
#include <vector>
#include <CGAL/IO/binary_file_io.h>
#include <boost/version.hpp>
#include <boost/bind/bind.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int_distribution.hpp>
#include <boost/random/uniform_01.hpp>
#include <boost/random/normal_distribution.hpp>
#if defined(CGAL_LINKED_WITH_BOOST_IOSTREAMS) && defined(CGAL_LINKED_WITH_BOOST_SERIALIZATION)
#include <boost/serialization/vector.hpp>
#endif
#include <boost/scoped_ptr.hpp>
#include <boost/make_shared.hpp>
#include "../dataview.h"
namespace CGAL { namespace internal {
namespace liblearning {
namespace RandomForest {
/// Buffer of (feature value, class label) pairs, one entry per mapped sample.
using FeatureClassDataFloat = std::vector< std::pair<float, int> >;

/// Initialization hook for a feature/class buffer.
///
/// Intentionally does nothing: the buffer is not resized here and all three
/// arguments (buffer, class count, sample count) are ignored.
inline void init_feature_class_data(FeatureClassDataFloat&, int, int)
{
}
/// Set of distinct feature indices.
using FeatureSet = std::unordered_set<int>;
/// Uniform integer distribution (Boost's default template arguments).
using UniformIntDist = boost::random::uniform_int_distribution<>;
/// Normal distribution (Boost's default template arguments).
using NormalDist = boost::random::normal_distribution<>;
/// Pseudo-random engine shared by the split generators.
using RandomGen = boost::random::mt19937;
/// Uniform distribution on the unit interval (Boost's default template arguments).
using UnitDist = boost::random::uniform_01<>;
/// Settings describing a forest and its training run.
///
/// The struct can (de)serialize itself either through Boost.Serialization
/// (only when CGAL is linked with Boost.IOStreams and Boost.Serialization)
/// or through the binary helpers used by write() / read().
struct ForestParams {
    size_t n_classes;            ///< number of classes
    size_t n_features;           ///< number of features
    size_t n_samples;            ///< number of samples
    size_t n_in_bag_samples;     ///< number of in-bag samples
    size_t max_depth;            ///< maximum depth (default 42)
    size_t n_trees;              ///< number of trees (default 100)
    size_t min_samples_per_node; ///< minimum number of samples per node (default 5)
    float sample_reduction;      ///< sample reduction (default 0.368)

    /// Zeroes the counts and installs the default hyper-parameters.
    ForestParams() :
        n_classes(0),
        n_features(0),
        n_samples(0),
        n_in_bag_samples(0),
        max_depth(42),
        n_trees(100),
        min_samples_per_node(5),
        sample_reduction(0.368f)
    {}

#if defined(CGAL_LINKED_WITH_BOOST_IOSTREAMS) && defined(CGAL_LINKED_WITH_BOOST_SERIALIZATION)
    /// Boost.Serialization entry point; handles both saving and loading.
    template <typename Archive>
    void serialize(Archive& ar, unsigned /*version*/)
    {
        ar & BOOST_SERIALIZATION_NVP(n_classes);
        ar & BOOST_SERIALIZATION_NVP(n_features);
        ar & BOOST_SERIALIZATION_NVP(n_samples);
        ar & BOOST_SERIALIZATION_NVP(n_in_bag_samples);
        ar & BOOST_SERIALIZATION_NVP(max_depth);
        ar & BOOST_SERIALIZATION_NVP(n_trees);
        ar & BOOST_SERIALIZATION_NVP(min_samples_per_node);
        ar & BOOST_SERIALIZATION_NVP(sample_reduction);
    }
#endif

    /// Writes every field to `os` in declaration order.
    ///
    /// Const because it only reads the fields, so const instances can be
    /// saved too. The field order must stay in sync with read().
    void write (std::ostream& os) const
    {
        I_Binary_write_size_t_into_uinteger32 (os, n_classes);
        I_Binary_write_size_t_into_uinteger32 (os, n_features);
        I_Binary_write_size_t_into_uinteger32 (os, n_samples);
        I_Binary_write_size_t_into_uinteger32 (os, n_in_bag_samples);
        I_Binary_write_size_t_into_uinteger32 (os, max_depth);
        I_Binary_write_size_t_into_uinteger32 (os, n_trees);
        I_Binary_write_size_t_into_uinteger32 (os, min_samples_per_node);
        I_Binary_write_float32 (os, sample_reduction);
    }

    /// Reads every field from `is`, in the same order write() emits them.
    void read (std::istream& is)
    {
        I_Binary_read_size_t_from_uinteger32 (is, n_classes);
        I_Binary_read_size_t_from_uinteger32 (is, n_features);
        I_Binary_read_size_t_from_uinteger32 (is, n_samples);
        I_Binary_read_size_t_from_uinteger32 (is, n_in_bag_samples);
        I_Binary_read_size_t_from_uinteger32 (is, max_depth);
        I_Binary_read_size_t_from_uinteger32 (is, n_trees);
        I_Binary_read_size_t_from_uinteger32 (is, min_samples_per_node);
        I_Binary_read_float32 (is, sample_reduction);
    }
};
struct QuadraticSplitter {
typedef float FeatureType;
typedef FeatureClassDataFloat FeatureClassData;
int n_features;
std::vector<FeatureType> w;
FeatureType threshold;
QuadraticSplitter() : n_features(0) {}
QuadraticSplitter(int n_features, std::vector<FeatureType> const& w) :
n_features(n_features), w(w)
{}
void set_threshold(FeatureType new_threshold) {
threshold = new_threshold;
}
FeatureType map_sample(FeatureType const* v) const {
double result = 0.0;
int i_feature = 0;
for (; i_feature < n_features; ++i_feature) {
result += w[i_feature] * v[i_feature];
}
for (int i_1 = 0; i_1 < n_features; ++i_1) {
for (int i_2 = 0; i_2 < n_features; ++i_2) {
result += w[i_feature++] * v[i_1] * v[i_2];
}
}
return result;
}
bool classify_sample(FeatureType const* v) const {
return map_sample(v) > threshold;
}
void map_points(DataView2D<FeatureType> samples,
DataView2D<int> labels,
int const* sample_idxes,
int n_samples,
FeatureClassData& data_points) const
{
for (int i_sample = 0; i_sample < n_samples; ++i_sample) {
int sample_idx = sample_idxes[i_sample];
int sample_class = labels(sample_idx, 0);
FeatureType sample_fval = map_sample(samples.row_pointer(sample_idx));
data_points[i_sample] = std::make_pair(sample_fval, sample_class);
}
}
#if defined(CGAL_LINKED_WITH_BOOST_IOSTREAMS) && defined(CGAL_LINKED_WITH_BOOST_SERIALIZATION)
template <typename Archive>
void serialize(Archive& ar, unsigned /*version*/)
{
ar & BOOST_SERIALIZATION_NVP(n_features);
ar & BOOST_SERIALIZATION_NVP(w);
ar & BOOST_SERIALIZATION_NVP(threshold);
}
#endif
};
struct LinearSplitter {
typedef float FeatureType;
typedef FeatureClassDataFloat FeatureClassData;
std::vector<FeatureType> w;
FeatureType threshold;
LinearSplitter() {}
LinearSplitter(std::vector<FeatureType> const& w) :
w(w)
{}
void set_threshold(FeatureType new_threshold) {
threshold = new_threshold;
}
bool classify_sample(FeatureType const* v) const {
return std::inner_product(w.begin(), w.end(), v, 0.0f) > threshold;
}
void map_points(DataView2D<FeatureType> samples,
DataView2D<int> labels,
int const* sample_idxes,
int n_samples,
FeatureClassData& data_points) const
{
for (int i_sample = 0; i_sample < n_samples; ++i_sample) {
int sample_idx = sample_idxes[i_sample];
int sample_class = labels(sample_idx, 0);
FeatureType sample_fval = std::inner_product(w.begin(), w.end(),
samples.row_pointer(sample_idx), 0.0f);
data_points[i_sample] = std::make_pair(sample_fval, sample_class);
}
}
#if defined(CGAL_LINKED_WITH_BOOST_IOSTREAMS) && defined(CGAL_LINKED_WITH_BOOST_SERIALIZATION)
template <typename Archive>
void serialize(Archive& ar, unsigned /*version*/)
{
ar & BOOST_SERIALIZATION_NVP(w);
ar & BOOST_SERIALIZATION_NVP(threshold);
}
#endif
};
/// Splitter that thresholds a single feature (one column of the samples).
struct AxisAlignedSplitter {
    typedef float FeatureType;
    typedef FeatureClassDataFloat FeatureClassData;

    int feature;            ///< index of the tested feature; -1 when unset
    FeatureType threshold;  ///< decision threshold

    /// Splitter with no feature selected (feature == -1), threshold 0.
    AxisAlignedSplitter() : feature(-1), threshold(0) {}

    /// Splitter testing column `feature`; the threshold starts at 0 until
    /// set_threshold() is called.
    AxisAlignedSplitter(int feature) :
        feature(feature), threshold(0)
    {}

    /// Sets the decision threshold used by classify_sample().
    void set_threshold(FeatureType new_threshold) {
        threshold = new_threshold;
    }

    /// Returns true when the tested feature of `v` is strictly above the
    /// threshold. `feature` must be a valid index into `v` (not checked).
    bool classify_sample(FeatureType const* v) const {
        return v[feature] > threshold;
    }

    /// Replaces the content of `data_points` with (feature value, class)
    /// pairs for a regularly spaced subset of the listed samples.
    ///
    /// Every `step`-th entry of `sample_idxes` is used, where
    /// step = n_samples / min(5000, n_samples). All samples are used while
    /// n_samples < 10000, and fewer than 10000 points are ever produced.
    ///
    /// @param samples      feature matrix, one row per sample
    /// @param labels       label matrix; column 0 holds the class
    /// @param sample_idxes row indices of the candidate samples
    /// @param n_samples    number of entries in `sample_idxes`
    /// @param data_points  output; cleared, then filled with the subset
    void map_points(DataView2D<FeatureType> samples,
                    DataView2D<int> labels,
                    int const* sample_idxes,
                    int n_samples,
                    FeatureClassData& data_points) const
    {
        data_points.clear();
        if (n_samples <= 0) {
            // Nothing to map; returning here also keeps the step
            // computation below from dividing by zero.
            return;
        }

        const std::size_t n_total = static_cast<std::size_t>(n_samples);
        const std::size_t size = (std::min)(std::size_t(5000), n_total);
        const std::size_t step = n_total / size; // >= 1 because size <= n_total

        // Exact number of points visited by the strided loop below.
        data_points.reserve((n_total + step - 1) / step);

        for (std::size_t i_sample = 0; i_sample < n_total; i_sample += step) {
            // determine index of this sample ...
            int sample_idx = sample_idxes[i_sample];
            // determine class ...
            int sample_class = labels(sample_idx, 0);
            // determine value of the selected feature for this sample
            FeatureType sample_fval = samples(sample_idx, feature);
            data_points.push_back(std::make_pair(sample_fval, sample_class));
        }
    }

#if defined(CGAL_LINKED_WITH_BOOST_IOSTREAMS) && defined(CGAL_LINKED_WITH_BOOST_SERIALIZATION)
    /// Boost.Serialization entry point; handles both saving and loading.
    template <typename Archive>
    void serialize(Archive& ar, unsigned /*version*/)
    {
        ar & BOOST_SERIALIZATION_NVP(feature);
        ar & BOOST_SERIALIZATION_NVP(threshold);
    }
#endif

    /// Writes the raw bytes of `feature` then `threshold` to `os`.
    /// Const because it only reads the members.
    void write (std::ostream& os) const
    {
        os.write(reinterpret_cast<const char*>(&feature), sizeof(int));
        os.write(reinterpret_cast<const char*>(&threshold), sizeof(FeatureType));
    }

    /// Reads the raw bytes of `feature` then `threshold` from `is`, in the
    /// layout produced by write().
    void read (std::istream& is)
    {
        is.read(reinterpret_cast<char*>(&feature), sizeof(int));
        is.read(reinterpret_cast<char*>(&threshold), sizeof(FeatureType));
    }
};
/// Proposes axis-aligned splits over a random subset of the features.
///
/// init() draws floor(sqrt(number of columns)) distinct feature indices;
/// gen_proposal() then hands them out one after another, wrapping around.
///
/// NOTE(review): `it` points into `features`, so a copy of this object
/// carries an iterator into the source object's set until init() is called
/// on the copy.
struct AxisAlignedRandomSplitGenerator {
    typedef float FeatureType;

    FeatureSet features;             ///< selected feature indices
    FeatureSet::const_iterator it;   ///< next feature to propose

    /// Selects the random feature subset for the next round of proposals.
    ///
    /// Only `samples.cols` and `gen` are used; the other arguments are
    /// ignored. With no columns the subset stays empty and num_proposals()
    /// returns 0.
    void init(DataView2D<FeatureType> samples,
              DataView2D<int> /*labels*/,
              int* /*sample_idxes*/,
              int /*n_samples*/,
              size_t /*n_classes*/,
              RandomGen& gen)
    {
        features.clear();
        it = features.begin();

        int n_features = samples.cols;
        if (n_features <= 0) {
            // A distribution over [0, -1] would violate its min <= max
            // precondition, and there is nothing to select anyway.
            return;
        }

        size_t n_used_features =
            static_cast<size_t>(std::sqrt(static_cast<double>(n_features)));
        UniformIntDist dist(0, n_features - 1);
        // insert into set until required number of unique features is found
        while (features.size() < n_used_features) {
            features.insert(dist(gen));
        }
        it = features.begin();
    }

    /// Returns a splitter on the next selected feature, cycling through the
    /// subset. If no feature was selected, returns a default-constructed
    /// splitter (feature == -1) instead of dereferencing an end iterator.
    AxisAlignedSplitter gen_proposal(RandomGen& /*gen*/)
    {
        if (features.empty()) {
            return AxisAlignedSplitter();
        }
        if (it == features.end()) {
            it = features.begin();
        }
        const int feature = *it;
        ++it;
        return AxisAlignedSplitter(feature);
    }

    /// Number of distinct features selected by the last init().
    size_t num_proposals() const {
        return features.size();
    }
};
/// Proposes linear splits with randomly drawn weight vectors.
struct LinearSplitGenerator {
    typedef float FeatureType;

    size_t n_features;   ///< number of weights per proposal; set by init()
    size_t n_proposals;  ///< value reported by num_proposals()

    /// @param n_proposals value reported by num_proposals() (default 5).
    /// n_features starts at 0, so proposals made before init() have no
    /// weights rather than an indeterminate number of them.
    LinearSplitGenerator(size_t n_proposals = 5) :
        n_features(0),
        n_proposals(n_proposals)
    {}

    /// Records the number of columns of `samples`; the other arguments are
    /// ignored.
    void init(DataView2D<FeatureType> samples,
              DataView2D<int> /*labels*/,
              int* /*sample_idxes*/,
              int /*n_samples*/,
              size_t /*n_classes*/,
              RandomGen& /*gen*/)
    {
        n_features = samples.cols;
    }

    /// Returns the proposal count given at construction.
    size_t num_proposals() const {
        return n_proposals;
    }

    /// Builds a splitter whose n_features weights are drawn from a
    /// default-constructed NormalDist using `gen`.
    LinearSplitter gen_proposal(RandomGen& gen) {
        NormalDist dist;
        std::vector<FeatureType> weights(n_features);
        for (size_t i_feature = 0; i_feature < n_features; ++i_feature) {
            weights[i_feature] = static_cast<FeatureType>(dist(gen));
        }
        return LinearSplitter(weights);
    }
};
/// Proposes quadratic splits with randomly drawn weights.
struct QuadraticSplitGenerator {
    typedef float FeatureType;

    size_t n_features;   ///< number of features; set by init()
    size_t n_proposals;  ///< value reported by num_proposals()

    /// @param n_proposals value reported by num_proposals() (default 5).
    /// n_features starts at 0, so proposals made before init() have no
    /// weights rather than an indeterminate number of them.
    QuadraticSplitGenerator(size_t n_proposals = 5) :
        n_features(0),
        n_proposals(n_proposals)
    {}

    /// Records the number of columns of `samples`; the other arguments are
    /// ignored.
    void init(DataView2D<FeatureType> samples,
              DataView2D<int> /*labels*/,
              int* /*sample_idxes*/,
              int /*n_samples*/,
              size_t /*n_classes*/,
              RandomGen& /*gen*/)
    {
        n_features = samples.cols;
    }

    /// Returns the proposal count given at construction.
    size_t num_proposals() const {
        return n_proposals;
    }

    /// Builds a splitter with n_features + n_features*n_features weights
    /// (linear terms followed by quadratic terms), each drawn from a
    /// default-constructed NormalDist using `gen`.
    QuadraticSplitter gen_proposal(RandomGen& gen) {
        NormalDist dist;
        std::vector<FeatureType> weights(n_features + n_features*n_features);
        for (size_t i_weight = 0; i_weight < weights.size(); ++i_weight) {
            weights[i_weight] = static_cast<FeatureType>(dist(gen));
        }
        return QuadraticSplitter(static_cast<int>(n_features), weights);
    }
};
}
}
}} // namespace CGAL::internal::
#endif