BALL 1.5.0
Loading...
Searching...
No Matches
QSARData.h
Go to the documentation of this file.
1// -*- Mode: C++; tab-width: 2; -*-
2// vi: set ts=2:
3//
4//
5
6#ifndef QSARH
7#define QSARH
8
9#include <iostream>
10#include <BALL/KERNEL/system.h>
11#include <BALL/FORMAT/SDFile.h>
12#include <BALL/FORMAT/PDBFile.h>
13#include <BALL/FORMAT/HINFile.h>
14#include <BALL/FORMAT/MOLFile.h>
15#include <vector>
16#include <list>
17#include <set>
18#include <map>
19#include <cmath>
20#include <sstream>
21#include <fstream>
22#include <limits>
23#include <fstream>
29#include <cstring>
30
31#ifndef STATISTICS
33#endif
34
35#ifndef QSAR_EXCEPTION
36#include <BALL/QSAR/exception.h>
37#endif
38
40
41// #ifndef MODEL
42// #include "Model.h"
43// #endif
44
45namespace BALL
46{
47 class MolecularSimilarity;
48
49 namespace QSAR
50 {
// Column: alias for a vector of double values.
// NOTE(review): `vector` is used unqualified here; it must be made visible by
// one of the included headers — confirm where.
51 typedef vector<double> Column;
// VMatrix: alias for a vector of Columns (a nested vector of doubles).
// NOTE(review): whether the outer index runs over descriptors or substances is
// not visible from this declaration — confirm against the implementation.
52 typedef vector<Column> VMatrix;
53
56 {
57 public:
58
60
62
// Const query returning a bool.
// Presumably reports whether the descriptor data has been centered
// (cf. centerData) — confirm against the implementation.
66 bool isDataCentered() const;
67
// Const query returning a bool.
// Presumably reports whether the response (Y) values have been centered
// (cf. centerData(center_Y)) — confirm against the implementation.
69 bool isResponseCentered() const;
70
// Takes `sd_file` by value and returns a raw pointer to a vector of String.
// Presumably `sd_file` is the path of an SD file whose property names are
// collected — confirm.
// NOTE(review): ownership of the returned vector is not visible from this
// declaration — confirm whether the caller must delete it.
75 vector<String>* readPropertyNames(String sd_file);
76
// Overload of readSDFile taking only `file` (a C string, presumably the path
// of the SD file to read — confirm).
// NOTE(review): which properties are treated as activities by this overload is
// decided in the implementation, not visible here.
80 void readSDFile(const char* file);
81
// Overload of readSDFile that additionally takes `act`, a multiset of ints
// passed by non-const reference (so the callee is allowed to modify it).
// Presumably `act` holds the IDs of the properties to use as activities — confirm.
// Defaults (written as 1/0): useExDesc = true, append = false,
// translate_class_labels = false.
87 void readSDFile(const char* file, std::multiset<int>& act, bool useExDesc=1, bool append=0, bool translate_class_labels=0);
88
// Overload of readSDFile that selects activities via `activity_names`, a set of
// String passed by non-const reference (so the callee is allowed to modify it).
// Defaults (written as 1/0): useExDesc = true, append = false,
// translate_class_labels = false, calc_phychem_properties = true,
// calc_topological_properties = true.
// NOTE(review): the exact effect of each flag is defined in the implementation
// — confirm before relying on the names alone.
89 void readSDFile(const char* file, std::set<String>& activity_names, bool useExDesc=1, bool append=0, bool translate_class_labels=0, bool calc_phychem_properties=1, bool calc_topological_properties=1);
90
93
// Non-const member; `center_Y` defaults to false (written as 0).
// Presumably centers the descriptor data and, when `center_Y` is true, the
// response values as well — confirm against the implementation.
96 void centerData(bool center_Y=0);
97
100
// Const query returning an unsigned int; by its name, the number of substances
// currently held — confirm how this is derived in the implementation.
102 unsigned int getNoSubstances() const;
103
// Const query returning an unsigned int; by its name, the number of descriptors
// currently held — confirm how this is derived in the implementation.
105 unsigned int getNoDescriptors() const;
106
// Reads input from a CSV file given by `file`.
// Defaults: sep = ",", appendDescriptors = false, translate_class_labels = false
// (the booleans are written as 0).
// NOTE(review): presumably `no_y` is the number of response columns and
// `xlabels` / `ylabels` say whether label rows/columns are present — the exact
// layout expected is defined in the implementation; confirm.
114 void readCSVFile(const char* file, int no_y, bool xlabels, bool ylabels, const char* sep=",", bool appendDescriptors=0, bool translate_class_labels=0);
115
// Takes a vector of String by value (the argument is copied).
// NOTE(review): the meaning and format of the strings (and how they map to
// response variables) are not visible from this declaration — confirm.
117 void manipulateY(std::vector<String> v);
118
122
// Takes `thresholds`, a vector of double, by value (the argument is copied).
// Presumably the response values are mapped to discrete classes using these
// thresholds — confirm the exact binning rule in the implementation.
125 void discretizeY(std::vector<double> thresholds);
126
// Takes a vector of String by value (the argument is copied).
// NOTE(review): the meaning and format of the strings (and how they map to
// descriptors) are not visible from this declaration — confirm.
127 void transformX(std::vector<String> v);
128
// Non-const member returning a vector of raw QSARData pointers by value.
// Presumably `p` is the number of partitions — confirm.
// NOTE(review): ownership of the returned QSARData objects is not expressed by
// the type — confirm whether the caller is responsible for deleting them.
130 std::vector<QSARData*> partitionInputData(int p);
131
// Const member taking `filename` by value.
// NOTE(review): the on-disk format is defined in the implementation; presumably
// it is the format readFromFile expects — confirm.
133 void saveToFile(string filename) const;
134
// Non-const member taking `filename` by value.
// NOTE(review): presumably restores data previously written by saveToFile —
// confirm the expected format in the implementation.
136 void readFromFile(string filename);
137
// Const member returning a vector of raw QSARData pointers by value.
// NOTE(review): presumably `fraction` is the share of substances moved into an
// external validation set; the valid range, the order of the returned sets and
// the ownership of the pointed-to objects are not visible here — confirm.
140 std::vector<QSARData*> generateExternalSet(double fraction) const;
141
// Const member returning a vector of raw QSARData pointers by value;
// `response_id` defaults to 0.
// NOTE(review): the splitting rule, the order of the returned sets and the
// ownership of the pointed-to objects are not visible here — confirm.
146 std::vector<QSARData*> evenSplit(int no_test_splits, int current_test_split_id, int response_id=0) const;
147
// Const member returning a raw pointer to a vector of double for index `s`.
// NOTE(review): bounds handling for `s` and ownership of the returned vector
// (newly allocated vs. internal storage) are not visible here — confirm.
149 std::vector<double>* getSubstance(int s) const;
150
// Const member returning a raw pointer to a vector of double for index `s`.
// NOTE(review): bounds handling for `s` and ownership of the returned vector
// (newly allocated vs. internal storage) are not visible here — confirm.
152 std::vector<double>* getActivity(int s) const;
153
// Const query returning an unsigned int; by its name, the number of response
// variables — confirm how this is derived in the implementation.
155 unsigned int getNoResponseVariables() const;
156
// Const member returning a pointer to a const vector of string, so callers
// cannot modify the names through it.
// NOTE(review): presumably points at the substance_names_ member, in which case
// the pointer is only valid while this object lives — confirm.
157 const std::vector<string>* getSubstanceNames() const;
158
// Const query without arguments returning a bool.
// Presumably tells whether the stored response values are all discrete —
// confirm the exact criterion in the implementation.
160 bool checkforDiscreteY() const;
161
162
// Const overload taking a file name and a multiset of ints by non-const
// reference (so the callee is allowed to modify `activity_IDs`).
// Presumably performs the discreteness check on the given file's activity
// properties rather than on already-loaded data — confirm.
164 bool checkforDiscreteY(const char* file, std::multiset<int>& activity_IDs) const;
165
// Non-const setter taking a C string `folder`.
// NOTE(review): what the data folder is used for, and whether the string is
// copied or the pointer retained, is not visible here — confirm.
167 void setDataFolder(const char* folder);
168
// Both thresholds are taken by non-const reference: callers must pass lvalues
// (no literals/temporaries) and the callee is allowed to modify them.
// NOTE(review): whether the thresholds are actually written to, and the exact
// correlation measure used, are not visible here — confirm.
171 void removeHighlyCorrelatedCompounds(double& compound_cor_threshold, double& feature_cor_threshold);
172
// Const member with no return value; `similar_descriptor_IDs` is a list of
// (uint, String) pairs passed by non-const reference, so the callee is allowed
// to modify it.
// NOTE(review): presumably the matching descriptors are written into this list
// as output, and whether it is cleared first is not visible — confirm.
// `uint` is not a standard C++ type; it has to be provided by a platform or
// project header.
178 void getSimilarDescriptors(int descriptor_ID, double correlation, std::list<std::pair<uint,String> >& similar_descriptor_IDs) const;
180
181
182 protected:
183
187
192
// Protected helper. `mol` and `molsim` are non-const references (may be
// modified); `descriptor_map` maps String to int and is read-only.
// NOTE(review): the meaning of the map's keys/values and where the computed
// descriptors are stored are not visible here — confirm.
194 void calculateTopologicalDescriptors(Molecule& mol, MolecularSimilarity& molsim, const std::map<String,int>& descriptor_map);
195
// Protected helper. `m` is read-only; `activity_IDs` is a non-const reference
// (may be modified). Defaults (written as 1): useExDesc = true, resize = true.
// NOTE(review): presumably fills column_names_ from the molecule's properties —
// confirm against the implementation.
197 void setDescriptorNames(const Molecule& m, std::multiset<int>& activity_IDs, bool useExDesc=1, bool resize=1);
198
// Protected helper taking a multiset of ints by non-const reference.
// NOTE(review): presumably the ints are indices of descriptors to drop; whether
// the multiset itself is modified is not visible here — confirm.
201 void removeInvalidDescriptors(std::multiset<int>& invalidDescriptors);
202
// Protected helper taking a multiset of ints by non-const reference.
// NOTE(review): presumably the ints are indices of substances to drop; whether
// the multiset itself is modified is not visible here — confirm.
203 void removeInvalidSubstances(std::multiset<int>& inv);
204
// Protected helper that fills `mat` (non-const reference) from the input file
// stream `in`; takes a single-character separator plus `lines` and `col` counts.
// NOTE(review): presumably `lines` x `col` is the size of the matrix to read —
// confirm, including which of the two is the outer dimension of the VMatrix.
// The parameter spelling "seperator" is kept as in the original declaration.
206 void readMatrix(VMatrix& mat, std::ifstream& in, char seperator, unsigned int lines, unsigned int col);
207
// Protected helper taking the activity IDs by non-const reference and a
// property count.
// NOTE(review): presumably validates that every ID in `act` is below
// `no_properties`; how violations are reported (exception vs. removal from
// `act`) is not visible here — confirm.
210 void checkActivityIDs(std::multiset<int>& act, int no_properties);
211
// Protected helper taking a read-only `source` data set and an index `s`;
// `backtransformation` defaults to false (written as 0).
// NOTE(review): presumably copies substance `s` of `source` into this object,
// optionally undoing a transformation — confirm against the implementation.
214 void insertSubstance(const QSARData* source, int s, bool backtransformation=0);
215
// Protected const helper: takes a read-only matrix and an output stream to
// write to.
// NOTE(review): the output layout (separators, row/column order) is defined in
// the implementation — confirm.
217 void printMatrix(const VMatrix& mat, std::ostream& out) const;
219
225
228
231
234
// Vector of strings; by its name, the names of the columns — presumably one
// per descriptor column — confirm.
236 vector<string> column_names_;
237
// Vector of strings; by its name, the names of the substances — presumably
// what getSubstanceNames exposes — confirm.
239 vector<string> substance_names_;
240
// Multiset of ints (duplicates allowed, kept sorted); presumably indices of
// descriptors found to be invalid — confirm where it is filled and consumed.
242 std::multiset<int> invalidDescriptors_;
243
// Multiset of ints (duplicates allowed, kept sorted); presumably indices of
// substances found to be invalid — confirm where it is filled and consumed.
244 std::multiset<int> invalidSubstances_;
245
247
// Map from String to int; presumably maps textual class labels to the integer
// IDs used when translate_class_labels is requested by the read functions —
// confirm against the implementation.
249 std::map<String,int> class_names_;
251
252
253
// Friend declarations: these four classes get access to the protected/private
// members of this class.
// NOTE(review): the declarations name the classes unqualified, so they refer to
// classes in the enclosing namespace scope — confirm that is where they live.
256 friend class Validation;
257 friend class Model;
258 friend class FitModel;
259 friend class FeatureSelection;
260
261 };
262
263 }
264}
265
266#endif // QSARH
vector< Column > VMatrix
Definition QSARData.h:52
vector< double > Column
Definition QSARData.h:51
void calculateBALLDescriptors(Molecule &m)
vector< string > column_names_
Definition QSARData.h:236
void checkActivityIDs(std::multiset< int > &act, int no_properties)
void discretizeY(std::vector< double > thresholds)
std::vector< double > * getActivity(int s) const
std::multiset< int > invalidSubstances_
Definition QSARData.h:244
void removeHighlyCorrelatedCompounds(double &compound_cor_threshold, double &feature_cor_threshold)
unsigned int getNoResponseVariables() const
void setDataFolder(const char *folder)
vector< String > * readPropertyNames(String sd_file)
void manipulateY(String v)
void removeInvalidDescriptors(std::multiset< int > &invalidDescriptors)
std::map< String, int > class_names_
Definition QSARData.h:249
void printMatrix(const VMatrix &mat, std::ostream &out) const
void readSDFile(const char *file, std::set< String > &activity_names, bool useExDesc=1, bool append=0, bool translate_class_labels=0, bool calc_phychem_properties=1, bool calc_topological_properties=1)
bool checkforDiscreteY() const
void readCSVFile(const char *file, int no_y, bool xlabels, bool ylabels, const char *sep=",", bool appendDescriptors=0, bool translate_class_labels=0)
void readSDFile(const char *file, std::multiset< int > &act, bool useExDesc=1, bool append=0, bool translate_class_labels=0)
void transformX(std::vector< String > v)
std::vector< QSARData * > generateExternalSet(double fraction) const
void setDescriptorNames(const Molecule &m, std::multiset< int > &activity_IDs, bool useExDesc=1, bool resize=1)
std::vector< QSARData * > evenSplit(int no_test_splits, int current_test_split_id, int response_id=0) const
std::vector< double > * getSubstance(int s) const
VMatrix descriptor_matrix_
Definition QSARData.h:224
void readMatrix(VMatrix &mat, std::ifstream &in, char seperator, unsigned int lines, unsigned int col)
bool isResponseCentered() const
void insertSubstance(const QSARData *source, int s, bool backtransformation=0)
void readSDFile(const char *file)
VMatrix y_transformations_
Definition QSARData.h:233
void readFromFile(string filename)
std::multiset< int > invalidDescriptors_
Definition QSARData.h:242
void removeInvalidSubstances(std::multiset< int > &inv)
bool checkforDiscreteY(const char *file, std::multiset< int > &activity_IDs) const
void calculateTopologicalDescriptors(Molecule &mol, MolecularSimilarity &molsim, const std::map< String, int > &descriptor_map)
void getSimilarDescriptors(int descriptor_ID, double correlation, std::list< std::pair< uint, String > > &similar_descriptor_IDs) const
void saveToFile(string filename) const
vector< string > substance_names_
Definition QSARData.h:239
unsigned int getNoDescriptors() const
void centerData(bool center_Y=0)
unsigned int getNoSubstances() const
std::vector< QSARData * > partitionInputData(int p)
VMatrix descriptor_transformations_
Definition QSARData.h:230
const std::vector< string > * getSubstanceNames() const
bool isDataCentered() const
void manipulateY(std::vector< String > v)
#define BALL_EXPORT