#include <moleculeset.h>
Public Member Functions | |
MoleculeSet construction functions | |
MoleculeSet () | |
MoleculeSet (const MoleculeSet &aSet) | |
~MoleculeSet () | |
int | add (MoleculeSet *aSet) |
void | addMolecule (Molecule *aMolecule) |
Molecule * | addMoleculeCopy (Molecule *aMolecule) |
void | deleteAll () |
Input functions | |
int | addSD (string aFileName, bool genericAtomType=false, long beginMolecule=-1, long endMolecule=-1) |
int | addKCF (string aFileName, long beginMolecule=-1, long endMolecule=-1) |
Molecule * | addSingleMOL (string aMolFile, bool genericAtomType=false) |
Molecule * | addSingleKCF (string aMolFile) |
void | readMolDirectory (string aPath, bool genericAtomType=false, long beginMolecule=-1, long endMolecule=-1) |
void | readKcfDirectory (string dataDir, long beginMolecule=-1, long endMolecule=-1) |
void | addMutag (string aFileName, string rFileName="", uint numMolToRead=500) |
void | readActivityFile (string aFileName) |
void | readDescriptorFile (string aFileName, string separator=";") |
void | readGistClassifyFile (string aFileName) |
void | readGistActivityFile (string aFileName, string aDescriptor) |
void | readGram (string aFileName, vector< vector< double > > *gram) |
void | readGramNormal (string aFileName) |
void | readGramRaw (string aFileName) |
void | readPartialCharges (string fileName) |
Accessor functions | |
uint | numMolecules () |
Molecule * | operator[] (string aName) throw ( CError ) |
Molecule * | getMolByName (string aName) throw ( CError ) |
Molecule * | operator[] (int anInd) throw ( CError ) |
Molecule * | getMolByIndex (int anInd) throw ( CError ) |
long | getPossibleValuesInIntDescriptor (string aDescriptorName, vector< int > *p) |
double | getPq () |
int | getConvergenceCondition () |
bool | nameExists (string aName) |
bool | hasActivity () |
void | setIntDescriptor (string aName, int aValue) |
void | setUniqueMorganIndices () |
void | setMorganLabels (int anOrder) |
void | setIntDescriptor (string aLabel, string aMolecule, int aValue) |
void | setActivity (string aMolecule, float aValue) |
void | setKashimaKernelParam (double aPq, int aConvergenceCondition, bool skipSkeleton=false) |
void | setComparisonSet (MoleculeSet *) |
void | setMorganChargesLabels (double threshold) |
MoleculeSet manipulation functions | |
void | selectAll () |
void | unSelectAll () |
int | select (vector< string > *aSubset) |
int | unSelect (vector< string > *aSubset) |
long | selectByFloatDescriptor (string aName, float aValue) |
long | selectByIntDescriptor (string aName, int aValue) |
long | selectByActivity (float aValue) |
long | selectHasActivity () |
int | selectByMW (float minmw, float maxmw=-1, bool addMolecularDescriptor=false) |
int | selectByNumAtoms (float minNumAtoms, float maxNumAtoms=-1, bool addMolecularDescriptor=false) |
void | sortByDescriptor (string aDescriptorName, int aDescriptorType, bool reverse=false) |
void | sortByDescriptor (string aDescriptorName, bool reverse=false) |
void | sortByMW () |
void | sortByNumAtoms () |
void | binClassifyFromDescriptor (string descriptorName, float value, bool smallerOrEqual=true) |
Molecule * | findFirstMoleculeWithName (string aName) throw ( CError ) |
void | removeDuplicates () |
void | deleteHiddenAtoms () |
void | hideHydrogens () |
void | hideSalts (string aReportFileName="") |
void | restoreHiddenAtoms () |
void | addFragmentsToSet (Molecule *aMol, int minAtoms=1) |
void | pushFragments (Molecule *aMol, int minAtoms=1) |
double | diversityBaryMean () |
vector< string > | atomsLabelsListing () |
vector< string > | atomsSymbolsListing () |
vector< int > | bondsListing () |
void | noTottersTransform () |
void | threeDtransform (int nBins, double distMin, double distMax) |
void | minMaxDistances (double *distMin, double *distMax) |
Kernels and Gram matrices related functions | |
void | gramCompute (double aPq, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int aParameter=1000, string aReportFileName="", int nbThreadsWanted=1, bool silentMode=false, bool filterTotters=false) |
void | gramCompute (MoleculeSet *anotherSet, double aPq, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int aParameter=1000, string aReportFileName="", int nbThreadsWanted=1, bool silentMode=false, bool filterTotters=false) |
void | gramCompute (double aPq, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int parameter1=1000, int parameter2=1, string aReportFileName="", int nbThreadsWanted=1, bool silentMode=false, bool filterTotters=false) |
void | gramCompute (MoleculeSet *anotherSet, double aPq, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int parameter1=1000, int parameter2=1, string aReportFileName="", int nbThreadsWanted=1, bool silentMode=false, bool filterTotters=false) |
void | gramCompute3D (double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float edgeKernelparameter, bool silentMode) |
void | gramCompute3D (MoleculeSet *anotherSet, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float edgeKernelparameter, bool silentMode) |
void | kernelCompute (Molecule *aMol, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), vector< double > *resultsRaw, vector< double > *resultsNormal, int convergenceCondition=1000, int parameter2=1, bool silentMode=false) |
void | resetGramMatrix () |
void | resetSelfKernels () |
void | initializeGram (double value) |
void | initializeSelfKernel (double value) |
void | normalizeGram () |
void | normalizeGram_raw () |
void | normalizeTanimoto () |
void | normalizeTanimoto_raw () |
void | normalizeTanimotoMinMax () |
void | addToGram (int row, int col, double value) |
void | addToGramNormal (int row, int col, double value) |
void | substractToGram (int row, int col, double value) |
double | getGramValue (int row, int col) |
Output functions | |
void | writeActivityFile (string aFilename, bool addActivityExtension=true, string activityDescriptor=ACTIVITY) |
void | writeGramMatrix (string aFileName, bool normal=false, bool self=false, bool silentMode=false) |
void | writeSelfKernelList (string aFilename, bool silentMode=false) |
void | writeSD (string aFileName, bool selectedOnly=false) |
void | writeSubsetSD (string aFileName, vector< string > *anOrder) |
void | writeSubsetKCF (string aFileName, vector< string > *anOrder) |
void | writeKCF (string aFileName, bool selectedOnly=false) |
string | toString (bool selectedOnly=false) |
string | toStringShort () |
string | toStringLong () |
void | describe (bool selectedOnly=false) |
void | describeShort () |
void | describeLong () |
void | writeDescriptors (string aFileName, bool selectedOnly=false) throw ( CError ) |
long | writeMolToDir (string aDirName, bool selectedOnly=false) |
long | writeDotsToDir (string aDirectory, bool selectedOnly=false, bool perretLabels=false) |
Protected Member Functions | |
void | setSortDescriptor (string aName, int aType, bool reverse=false) throw ( CError ) |
string | getSortDescriptorName () |
Protected Attributes | |
MoleculeSet * | comparisonSet |
vector< vector< double > > * | gram |
vector< vector< double > > * | gramNormal |
bool | gramCalculated |
double | pq |
int | convergenceCondition |
int | subsetStart |
int | subsetSize |
bool | activitySet |
It is a set in the mathematical sense, in that no two molecules should have the same name. However, a MoleculeSet can contain two identical graphs.
WARNING: no check is made, and no error is thrown, when two molecules have the same name, except when calling the [] operator.
|
class constructor. |
|
destructor for the MoleculeSet. deletes gram and gramNormal. |
|
adds all molecules in aSet to the current set. |
|
NOT DOCUMENTED |
|
reads a KCF file and returns the number of created molecules. |
|
adds a molecule to the set. molecules added to the molecule set are not deleted when the set is deleted. use the deleteAll() function to do so (CAUTION if you reference these molecules from elsewhere). |
|
adds a copy of the molecule in argument. used by add() to merge two datasets. |
|
reads the Mutag dataset. Atoms and bonds are read from aFileName while the biological activity of the molecules is read from file rFileName. numMolToRead specifies how many molecules to read, counting from the first one.
|
|
adds the content of an SD file and returns the number of created molecules. if genericAtomType is true, then the atoms are not read from the periodic table, but are created based only on the label provided. beginMolecule and endMolecule specify the index of the first and last molecule to include in the training set (starting count from 0). a value of -1 (default) means no limit. |
|
creates a new molecule in the dataset and reads its definition from a Kcf file. |
|
creates a new molecule in the dataset and reads its definition from a MDL MOL file. |
|
adds a value to a Gram matrix entry. |
|
adds a value to a Gram normal matrix entry. |
|
lists the different kinds of atoms present in the set based on their Morgan labels. |
|
lists the different kinds of atoms present in the set based on their symbols. |
|
sets the activity of molecules in the set according to the value of a descriptor and a threshold value. if true is passed as third argument (default) then molecules with descriptorName <= value are considered positive. if false is passed as third argument then molecules with descriptorName >= value are considered positive. if a descriptor is missing then the molecule is left without activity. |
|
lists the different kinds of bonds present in the set. |
|
deletes all molecules in dataset (memory deallocation) and clears the vector containing the pointers. |
|
deletes all hidden atoms in all molecules of the set. |
|
writes a description of the moleculeSet to cout. |
|
writes a long description of the moleculeSet to cout. |
|
writes a short description of the moleculeSet to cout. |
|
returns the mean distance of all molecules to the moleculeSet barycenter. |
|
returns a pointer to the molecule with name aName in the MoleculeSet. |
|
returns the convergence condition currently in use in the MoleculeSet for the calculation of random walk graph Kernel. |
|
returns a Gram matrix entry. |
|
returns a pointer to the molecule of index anInd in the set. |
|
returns a pointer to the first molecule with name aName in the set. Throws a CError exception if no molecule with that name exists in the set. WARNING: if more than one molecule has the same name then a CError is also thrown. |
|
fills a vector of int with the values an int descriptor can take among all molecules and returns the number of such values. |
|
returns the stop probability currently in use in the MoleculeSet for the calculation of random walk graph Kernel. |
|
returns the name of the sorting descriptor. |
|
"TRUE" gramCompute() function, i.e., the one called by EVERY OTHER GRAMCOMPUTE FUNCTION. |
|
calculates the gram matrix of similarity using the marginalized graph kernel for all molecules in the MoleculeSet using a specified graph, atom and bond kernel. if aReportFileName is specified, a report is saved in that file. |
|
calculates the gram matrix of similarity using the marginalized graph kernel for all molecules in the MoleculeSet using a specified graph, atom and bond kernel. if aReportFileName is specified, a report is saved in that file. |
|
calculates the gram matrix of similarity using the marginalized graph kernel for all molecules in the MoleculeSet using a specified graph, atom and bond kernel. if aReportFileName is specified, a report is saved in that file. |
|
"TRUE" gramCompute3D function, i.e., the one called by EVERY OTHER GRAMCOMPUTE3D FUNCTION. |
|
3D kernel computation. NOTE: the AtomKernel/BondKernel prototypes are different from those of gramCompute. |
|
returns true if the activity of molecules in the set was set using readActivityFile. WARNING: no check is made to verify if all molecules were set. |
|
hides all hydrogens in all molecules of the set. |
|
hides all but the largest connected graph in all molecules of the set. |
|
initializes every element gram matrix to the given value. |
|
initializes the self kernels to the given value. |
|
computes all kernel values between a molecule aMol and all compounds in the set. |
|
computes the Euclidean distance between the XYZ coordinates of two atoms. |
|
returns true if a molecule with name aName exists in the set, false otherwise. |
|
normalizes the gram matrix, i.e. compute gramNormal. |
|
normalizes the gram matrix, i.e. compute gramNormal. WARNING: NORMALIZATION BASED ON THE RAW GRAM MATRIX (instead of self-kernels) |
|
normalizes the Gram matrix according to the Tanimoto kernel definition (Ralaivola et al. 2005). |
|
normalizes the Gram matrix according to the Tanimoto kernel definition, BASED ON THE RAW GRAM MATRIX |
|
normalizes the Gram matrix according to the 'min-max' Tanimoto kernel definition (Ralaivola et al. 2005). |
|
transforms the molecular graphs into graphs preventing tottering paths (see (Mahe et al., 2004)). |
|
returns the number of molecules in the MoleculeSet. |
|
returns a pointer to the molecule of index anInd in the set. |
|
returns a pointer to the first molecule with name aName in the set. Throws a CError exception if no molecule with that name exists in the set. WARNING: if more than one molecule has the same name then a CError is also thrown. |
|
NOT DOCUMENTED |
|
reads the activity of molecules from an activity file. expects a file containing the following lines: "label [tab] class"; "molname [tab] 1" for active molecules; "molname [tab] -1" for inactive molecules. molname is the fileName of the Mol file. |
|
reads a descriptor file. the first line in the file indicates the Descriptor comment text. the second line in the file indicates the Descriptor units. the third line in the file indicates the Descriptor types. the fourth line indicates the Descriptor names. data start at the fifth line. the first column should contain the same molecule names as those obtained by getName() for the molecules in the dataset. example (one file line per quoted entry): "Full name;Log octanol partition coefficient;boiling point;special comment", "NA;NA;K;NA", "string;float;float;string", "name;logP;Bp;acomment", "ethane;10;200;junk data". WARNING: does not check for duplicate name entries in the descriptor file. WARNING: does not add anything to the molecules not contained in the descriptor file. |
|
reads the content of a gist activity file and adds the information to descriptor aDescriptor for each molecule matching molecule names. |
|
reads the content of a file produced by gist-classify and adds the information matching molecule names in the set as descriptors gistname, gistclass, and gistdiscr. |
|
reads a gram matrix matching the dataset. emits an error if the dimension of the gram matrix read does not match the number of compounds in the dataset. |
|
reads a normalized gram matrix matching the dataset. |
|
reads a raw gram matrix matching the dataset. |
|
loads all .kcf files in a directory and adds the molecules to the set. |
|
loads all .mol files in a directory and adds the molecules to the set. |
|
reads the partial charges associated to the molecule set from an input file. NOTE : the input file has one line per molecule of the molecule set, and within each line, the values of the partial charges are separated by ';'. |
|
removes duplicate entries in the set. |
|
erases the Gram Matrix and sets the gramCalculated flag to false. |
|
resets all calculated selfkernels for the molecules in the dataset. |
|
restores the hidden atoms for all molecules. |
|
selects the molecules with the names provided as arguments in a vector of string. unselect all others. returns the number of selected molecules. |
|
selects all molecules in the dataset. WARNING the selection status is stored in the Molecule class. Therefore molecules in other datasets will be selected too... |
|
selects all molecules having activity equal to aValue. |
|
selects all molecules having float descriptor aName with value aValue, unselects all others. |
|
selects all molecules having int descriptor aName with value aValue, unselects all others. |
|
selects all molecules in the dataset with mw >= minmw and <= maxmw. if maxmw = -1 then there is no maximum limit. if addMolecularDescriptor is true then a floatDescriptor with label mw is added to the molecule. |
|
selects all molecules in the dataset with number of atoms >= minNumAtoms and <= maxNumAtoms. if maxNumAtoms = -1 then there is no maximum limit. if addMolecularDescriptor is true then an intDescriptor with label numAtoms is added to the molecule. |
|
selects all molecules which have a defined activity status. |
|
sets the activity of aMolecule to aValue. |
|
sets the comparison set of the moleculeSet. |
|
sets the value of intDescriptor aLabel of molecule aMolecule to aValue. |
|
sets integer aName to aValue for all compounds in the dataset. |
|
sets the start, stop (aPq) and transition probabilities for all molecules according to the article by Kashima et al. using the setKashimaKernelProb(aFloat) function of the Molecule class. WARNING: a call to this function erases the gram Matrix and it will be recalculated. |
|
sets the 'morgan charges' labels of the atoms, i.e., the concatenation of the Morgan labels of the atoms and the (+/-) sign of their partial charges. |
|
sets the morganLabels of each molecule to the anOrder iteration of the Morgan index calculation process. |
|
sets the type and name of descriptor to be used when sorting molecules. if reverse == true the sorting will be in descending order. |
|
sets the uniqueMorganIndex of each atom to the Morgan index having the maximum of different connectivity values for the molecule. |
|
sorts the molecule collection according to the molecule Descriptor descriptorName. the descriptor type will be read from the descriptor name if the name is of the form *.integer or *.float, and set to string otherwise. if reverse = true then the sorting is in decreasing order. |
|
sorts the molecule collection according to the molecule Descriptor descriptorName of type descriptorType. if reverse = true then the sorting is in decreasing order. |
|
sorts all compounds in the set by Molecular weight. |
|
sorts all compounds in the set by their number of atoms. |
|
subtracts a value from a Gram matrix entry. |
|
transforms the molecular graphs into '3D complete graphs' with edges labeled by inter atomic distances in order to compute the pharmacophore kernel (see (Mahe et al., 2006)). |
|
returns a string description of the MoleculeSet. |
|
returns a long string description of the MoleculeSet. |
|
returns a short string description of the MoleculeSet (number of molecules in the set and short description of each molecule). |
|
unselects the molecules with the names provided as arguments in a vector of string. returns the number of selected molecules. |
|
unselects all molecules in the dataset. WARNING the selection status is stored in the Molecule class. Therefore molecules in other datasets will be unselected too... |
|
writes a file with the biological activity of molecules in a format compatible with GIST. |
|
writes a ';' separated file containing all descriptors for all molecules. |
|
writes a dot file for all molecule in the set to aDirectory. |
|
writes the gram matrix.
|
|
writes a KCF file with the whole moleculeSet. |
|
writes a mol file for each molecule in the set to aDirName. if selectedOnly == true then only selected molecules are written. |
|
writes a MDL structure data (SD) file with the molecules in the moleculeSet. setting selectedOnly to true outputs only the selected compounds. a pointer to a vector of strings containing the molecule names for ordered output can be specified. |
|
writes all self kernel values in a file. |
|
writes a KCF file with the molecules in the moleculeSet matching the names given as argument. |
|
writes a MDL structure data (SD) file with the molecules in the moleculeSet matching the names given as argument. |
|
stores if the activity of the molecules in the set was specified. |
|
comparison set of the molecule set. In test set mode, the comparison set is another set of compounds. In self set mode, the comparison set is the molecule set itself. |
|
convergence condition used in the calculation of the Kashima Kernel. |
|
Gram matrix. |
|
contains true if the gram matrix was evaluated for the current MoleculeSet, false otherwise. note that setKashimaKernelProb sets this flag to false. {Addition of a new molecule to the set when the flag is set to true induces the calculation of a new line to the gram matrix DEPRECATED} |
|
normal Gram matrix. |
|
kashima Stop probability for the moleculeSet used to set kashimaProb in added molecules. |
|
subset size. |
|
subset start. |