MoleculeSet Class Reference

#include <moleculeset.h>

List of all members.

Public Member Functions

MoleculeSet construction functions

MoleculeSet ()

MoleculeSet (const MoleculeSet &aSet)

~MoleculeSet ()

int add (MoleculeSet *aSet)

void addMolecule (Molecule *aMolecule)

Molecule * addMoleculeCopy (Molecule *aMolecule)

void deleteAll ()

Input functions

int addSD (string aFileName, bool genericAtomType=false, long beginMolecule=-1, long endMolecule=-1)

int addKCF (string aFileName, long beginMolecule=-1, long endMolecule=-1)

Molecule * addSingleMOL (string aMolFile, bool genericAtomType=false)

Molecule * addSingleKCF (string aMolFile)

void readMolDirectory (string aPath, bool genericAtomType=false, long beginMolecule=-1, long endMolecule=-1)

void readKcfDirectory (string dataDir, long beginMolecule=-1, long endMolecule=-1)

void addMutag (string aFileName, string rFileName="", uint numMolToRead=500)

void readActivityFile (string aFileName)

void readDescriptorFile (string aFileName, string separator=";")

void readGistClassifyFile (string aFileName)

void readGistActivityFile (string aFileName, string aDescriptor)

void readGram (string aFileName, vector< vector< double > > *gram)

void readGramNormal (string aFileName)

void readGramRaw (string aFileName)

void readPartialCharges (string fileName)

Accessor functions

uint numMolecules ()

Molecule * operator[] (string aName) throw ( CError )

Molecule * getMolByName (string aName) throw ( CError )

Molecule * operator[] (int anInd) throw ( CError )

Molecule * getMolByIndex (int anInd) throw ( CError )

long getPossibleValuesInIntDescriptor (string aDescriptorName, vector< int > *p)

double getPq ()

int getConvergenceCondition ()

bool nameExists (string aName)

bool hasActivity ()

void setIntDescriptor (string aName, int aValue)

void setUniqueMorganIndices ()

void setMorganLabels (int anOrder)

void setIntDescriptor (string aLabel, string aMolecule, int aValue)

void setActivity (string aMolecule, float aValue)

void setKashimaKernelParam (double aPq, int aConvergenceCondition, bool skipSkeleton=false)

void setComparisonSet (MoleculeSet *)

void setMorganChargesLabels (double threshold)

MoleculeSet manipulation functions

void selectAll ()

void unSelectAll ()

int select (vector< string > *aSubset)

int unSelect (vector< string > *aSubset)

long selectByFloatDescriptor (string aName, float aValue)

long selectByIntDescriptor (string aName, int aValue)

long selectByActivity (float aValue)

long selectHasActivity ()

int selectByMW (float minmw, float maxmw=-1, bool addMolecularDescriptor=false)

int selectByNumAtoms (float minNumAtoms, float maxNumAtoms=-1, bool addMolecularDescriptor=false)

void sortByDescriptor (string aDescriptorName, int aDescriptorType, bool reverse=false)

void sortByDescriptor (string aDescriptorName, bool reverse=false)

void sortByMW ()

void sortByNumAtoms ()

void binClassifyFromDescriptor (string descriptorName, float value, bool smallerOrEqual=true)

Molecule * findFirstMoleculeWithName (string aName) throw ( CError )

void removeDuplicates ()

void deleteHiddenAtoms ()

void hideHydrogens ()

void hideSalts (string aReportFileName="")

void restoreHiddenAtoms ()

void addFragmentsToSet (Molecule *aMol, int minAtoms=1)

void pushFragments (Molecule *aMol, int minAtoms=1)

double diversityBaryMean ()

vector< string > atomsLabelsListing ()

vector< string > atomsSymbolsListing ()

vector< int > bondsListing ()

void noTottersTransform ()

void threeDtransform (int nBins, double distMin, double distMax)

void minMaxDistances (double *distMin, double *distMax)

Kernels and Gram matrices related functions

void gramCompute (double aPq, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int aParameter=1000, string aReportFileName="", int nbThreadsWanted=1, bool silentMode=false, bool filterTotters=false)

void gramCompute (MoleculeSet *anotherSet, double aPq, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int aParameter=1000, string aReportFileName="", int nbThreadsWanted=1, bool silentMode=false, bool filterTotters=false)

void gramCompute (double aPq, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int parameter1=1000, int parameter2=1, string aReportFileName="", int nbThreadsWanted=1, bool silentMode=false, bool filterTotters=false)

void gramCompute (MoleculeSet *anotherSet, double aPq, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int parameter1=1000, int parameter2=1, string aReportFileName="", int nbThreadsWanted=1, bool silentMode=false, bool filterTotters=false)

void gramCompute3D (double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float edgeKernelparameter, bool silentMode)

void gramCompute3D (MoleculeSet *anotherSet, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float edgeKernelparameter, bool silentMode)

void kernelCompute (Molecule *aMol, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), vector< double > *resultsRaw, vector< double > *resultsNormal, int convergenceCondition=1000, int parameter2=1, bool silentMode=false)

void resetGramMatrix ()

void resetSelfKernels ()

void initializeGram (double value)

void initializeSelfKernel (double value)

void normalizeGram ()

void normalizeGram_raw ()

void normalizeTanimoto ()

void normalizeTanimoto_raw ()

void normalizeTanimotoMinMax ()

void addToGram (int row, int col, double value)

void addToGramNormal (int row, int col, double value)

void substractToGram (int row, int col, double value)

double getGramValue (int row, int col)

Output functions

void writeActivityFile (string aFilename, bool addActivityExtension=true, string activityDescriptor=ACTIVITY)

void writeGramMatrix (string aFileName, bool normal=false, bool self=false, bool silentMode=false)

void writeSelfKernelList (string aFilename, bool silentMode=false)

void writeSD (string aFileName, bool selectedOnly=false)

void writeSubsetSD (string aFileName, vector< string > *anOrder)

void writeSubsetKCF (string aFileName, vector< string > *anOrder)

void writeKCF (string aFileName, bool selectedOnly=false)

string toString (bool selectedOnly=false)

string toStringShort ()

string toStringLong ()

void describe (bool selectedOnly=false)

void describeShort ()

void describeLong ()

void writeDescriptors (string aFileName, bool selectedOnly=false) throw ( CError )

long writeMolToDir (string aDirName, bool selectedOnly=false)

long writeDotsToDir (string aDirectory, bool selectedOnly=false, bool perretLabels=false)

Protected Member Functions

void setSortDescriptor (string aName, int aType, bool reverse=false) throw ( CError )

string getSortDescriptorName ()

Protected Attributes

MoleculeSet * comparisonSet

vector< vector< double > > * gram

vector< vector< double > > * gramNormal

bool gramCalculated

double pq

int convergenceCondition

int subsetStart

int subsetSize

bool activitySet

Detailed Description

Set of molecules on which virtual experiments may be performed

Author:: Dr Jean-Luc Perret (luc@kuicr.kyoto-u.ac.jp), Kyoto University, Japan

Version:: 0.3

Date:: 17 Jan 2004

CLASS NAME: MoleculeSet FOR: SNSF SPONSORED PROJECT PURPOSE: This class implements the notion of molecule set

It is a set in the mathematical sense, in that no two molecules should have the same name. However, a MoleculeSet can contain two identical graphs.

WARNING: no check is made (and no error is thrown) when two molecules have the same name, except when calling the [] operator.

Examples:: moleculeset_example.cpp.

Constructor & Destructor Documentation

MoleculeSet::MoleculeSet ( )

class constructor.

MoleculeSet::~MoleculeSet ( )

destructor for the MoleculeSet. deletes gram and gramNormal.

Member Function Documentation

int MoleculeSet::add ( MoleculeSet * aSet )

adds all molecules in aSet to the current set.

void MoleculeSet::addFragmentsToSet ( Molecule * aMol,

int minAtoms = 1

)

NOT DOCUMENTED

int MoleculeSet::addKCF ( string aFileName,

long beginMolecule = -1,

long endMolecule = -1

)

reads a KCF file and returns the number of created molecules.

void MoleculeSet::addMolecule ( Molecule * aMolecule )

adds a molecule to the set. molecules added to the molecule set are not deleted when the set is deleted. use the deleteAll() function to do so (CAUTION if you reference these molecules from elsewhere).

Molecule* MoleculeSet::addMoleculeCopy ( Molecule * aMolecule )

adds a copy of the molecule in argument. used by add() to merge two datasets.

void MoleculeSet::addMutag ( string aFileName,

string rFileName = "",

uint numMolToRead = 500

)

reads the Mutag dataset. Atoms and bonds are read from aFileName while the biological activity of the molecules is read from rFileName. numMolToRead specifies how many of the first molecules to read.
Examples:
moleculeset_example.cpp.

int MoleculeSet::addSD ( string aFileName,

bool genericAtomType = false,

long beginMolecule = -1,

long endMolecule = -1

)

adds an sd file content and returns the number of created molecules. if genericAtomType is true, then the atoms are not read from the periodic table, but are created based only on the label provided.
beginMolecule and endMolecule specify the index of the first and last molecule to include in the training set (starting count from 0). values of -1 (default) mean no limit.

Molecule* MoleculeSet::addSingleKCF ( string aMolFile )

creates a new molecule in the dataset and reads its definition from a Kcf file.

Molecule* MoleculeSet::addSingleMOL ( string aMolFile,

bool genericAtomType = false

)

creates a new molecule in the dataset and reads its definition from a MDL MOL file.

void MoleculeSet::addToGram ( int row,

int col,

double value

)

adds a value to a Gram matrix entry.

void MoleculeSet::addToGramNormal ( int row,

int col,

double value

)

adds a value to a Gram normal matrix entry.

vector<string> MoleculeSet::atomsLabelsListing ( )

lists the different kinds of atoms present in the set based on their Morgan labels.

vector<string> MoleculeSet::atomsSymbolsListing ( )

lists the different kinds of atoms present in the set based on their symbols.

void MoleculeSet::binClassifyFromDescriptor ( string descriptorName,

float value,

bool smallerOrEqual = true

)

sets the activity of molecules in the set according to the value of a descriptor and a threshold value. if true is passed as third argument (default) then molecules with descriptorName <= value are considered positive. if false is passed as third argument then molecules with descriptorName >= value are considered positive. if a descriptor is missing then the molecule is left without activity.

vector<int> MoleculeSet::bondsListing ( )

lists the different kinds of bonds present in the set.

void MoleculeSet::deleteAll ( )

deletes all molecules in dataset (memory deallocation) and clears the vector containing the pointers.

void MoleculeSet::deleteHiddenAtoms ( )

deletes all hidden atoms in all molecules of the set.

void MoleculeSet::describe ( bool selectedOnly = false )

writes a description of the moleculeSet to cout.

void MoleculeSet::describeLong ( )

writes a long description of the moleculeSet to cout.

void MoleculeSet::describeShort ( )

writes a short description of the moleculeSet to cout.

double MoleculeSet::diversityBaryMean ( )

returns the mean distance of all molecules to the moleculeSet barycenter.

Molecule* MoleculeSet::findFirstMoleculeWithName ( string aName ) throw ( CError )

returns a pointer to the molecule with name aName in the MoleculeSet.

int MoleculeSet::getConvergenceCondition ( ) [inline]

returns the convergence condition currently in use in the MoleculeSet for the calculation of random walk graph Kernel.

double MoleculeSet::getGramValue ( int row,

int col

)

returns a Gram matrix entry.

Molecule* MoleculeSet::getMolByIndex ( int anInd ) throw ( CError )

returns a pointer to the molecule of index anInd in the set.

Molecule* MoleculeSet::getMolByName ( string aName ) throw ( CError )

returns a pointer to the first molecule with name aName in the set. Throws a CError exception if no molecule with that name exists in the set. WARNING: if more than one molecule has the same name then a CError is also thrown.

long MoleculeSet::getPossibleValuesInIntDescriptor ( string aDescriptorName,

vector< int > * p

)

fills a vector of int with the values an int descriptor can take among all molecules and returns the number of such values.

double MoleculeSet::getPq ( ) [inline]

returns the stop probability currently in use in the MoleculeSet for the calculation of random walk graph Kernel.

string MoleculeSet::getSortDescriptorName ( ) [protected]

returns the name of the sorting descriptor.

void MoleculeSet::gramCompute ( MoleculeSet * anotherSet,

double aPq,

double(*)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int) pt2GraphKernel,

double(*)(Atom *, Atom *) pt2AtomKernel,

double(*)(Bond *, Bond *) pt2BondKernel,

int parameter1 = 1000,

int parameter2 = 1,

string aReportFileName = "",

int nbThreadsWanted = 1,

bool silentMode = false,

bool filterTotters = false

)

"TRUE" gramCompute() function, i.e., the one called by EVERY OTHER GRAMCOMPUTE FUNCTION.

void MoleculeSet::gramCompute ( double aPq,

double(*)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int) pt2GraphKernel,

double(*)(Atom *, Atom *) pt2AtomKernel,

double(*)(Bond *, Bond *) pt2BondKernel,

int parameter1 = 1000,

int parameter2 = 1,

string aReportFileName = "",

int nbThreadsWanted = 1,

bool silentMode = false,

bool filterTotters = false

)

calculates the gram matrix of similarity using the marginalized graph kernel for all molecules in the MoleculeSet using a specified graph, atom and bond kernel. if aReportFileName is specified a report is saved in that file.

void MoleculeSet::gramCompute ( MoleculeSet * anotherSet,

double aPq,

double(*)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int) pt2GraphKernel,

double(*)(Atom *, Atom *) pt2AtomKernel,

double(*)(Bond *, Bond *) pt2BondKernel,

int aParameter = 1000,

string aReportFileName = "",

int nbThreadsWanted = 1,

bool silentMode = false,

bool filterTotters = false

)

calculates the gram matrix of similarity using the marginalized graph kernel for all molecules in the MoleculeSet using a specified graph, atom and bond kernel. if aReportFileName is specified a report is saved in that file.

void MoleculeSet::gramCompute ( double aPq,

double(*)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int) pt2GraphKernel,

double(*)(Atom *, Atom *) pt2AtomKernel,

double(*)(Bond *, Bond *) pt2BondKernel,

int aParameter = 1000,

string aReportFileName = "",

int nbThreadsWanted = 1,

bool silentMode = false,

bool filterTotters = false

)

calculates the gram matrix of similarity using the marginalized graph kernel for all molecules in the MoleculeSet using a specified graph, atom and bond kernel. if aReportFileName is specified a report is saved in that file.

void MoleculeSet::gramCompute3D ( MoleculeSet * anotherSet,

double(*)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float) pt2GraphKernel,

double(*)(Atom *, Atom *) pt2AtomKernel,

double(*)(float, float, float) pt2BondKernel,

float edgeKernelparameter,

bool silentMode

)

"TRUE" gramCompute3D function, i.e., the one called by EVERY OTHER GRAMCOMPUTE3D FUNCTION.

void MoleculeSet::gramCompute3D ( double(*)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float) pt2GraphKernel,

double(*)(Atom *, Atom *) pt2AtomKernel,

double(*)(float, float, float) pt2BondKernel,

float edgeKernelparameter,

bool silentMode

)

3D kernel computation. NOTE: the AtomKernel/BondKernel prototypes are different from those of gramCompute.

bool MoleculeSet::hasActivity ( ) [inline]

returns true if the activity of molecules in the set was set using readActivityFile. WARNING: no check is made to verify if all molecules were set.

void MoleculeSet::hideHydrogens ( )

hides all hydrogens in all molecules of the set.

void MoleculeSet::hideSalts ( string aReportFileName = "" )

hides all but the largest connected graph in all molecules of the set.

void MoleculeSet::initializeGram ( double value )

initializes every element of the gram matrix to the given value.

void MoleculeSet::initializeSelfKernel ( double value )

initializes the self kernels to the given value.

void MoleculeSet::kernelCompute ( Molecule * aMol,

double(*)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int) pt2GraphKernel,

double(*)(Atom *, Atom *) pt2AtomKernel,

double(*)(Bond *, Bond *) pt2BondKernel,

vector< double > * resultsRaw,

vector< double > * resultsNormal,

int convergenceCondition = 1000,

int parameter2 = 1,

bool silentMode = false

)

computes all kernel values between a molecule aMol and all compounds in the set.

void MoleculeSet::minMaxDistances ( double * distMin,

double * distMax

)

computes the Euclidian distance between the XYZ coordinates of two atoms.

bool MoleculeSet::nameExists ( string aName )

returns true if a molecule with name aName exists in the set, false otherwise.

void MoleculeSet::normalizeGram ( )

normalizes the gram matrix, i.e. compute gramNormal.

void MoleculeSet::normalizeGram_raw ( )

normalizes the gram matrix, i.e. compute gramNormal. WARNING: NORMALIZATION BASED ON THE RAW GRAM MATRIX (instead of self-kernels)

void MoleculeSet::normalizeTanimoto ( )

normalizes the Gram matrix according to the Tanimoto kernel definition (Ralaivola et al. 2005).

void MoleculeSet::normalizeTanimoto_raw ( )

normalizes the Gram matrix according to the Tanimoto kernel definition, BASED ON THE RAW GRAM MATRIX

void MoleculeSet::normalizeTanimotoMinMax ( )

normalizes the Gram matrix according to the 'min-max' Tanimoto kernel definition (Ralaivola et al. 2005).

void MoleculeSet::noTottersTransform ( )

transforms the molecular graphs into graph preventing tottering paths (see (Mahe et al., 2004)).

uint MoleculeSet::numMolecules ( ) [inline]

returns the number of molecules in the MoleculeSet.

Molecule* MoleculeSet::operator[] ( int anInd ) throw ( CError )

returns a pointer to the molecule of index anInd in the set.

Molecule* MoleculeSet::operator[] ( string aName ) throw ( CError )

returns a pointer to the first molecule with name aName in the set. Throws a CError exception if no molecule with that name exists in the set. WARNING: if more than one molecule has the same name then a CError is also thrown.

void MoleculeSet::pushFragments ( Molecule * aMol,

int minAtoms = 1

)

NOT DOCUMENTED

void MoleculeSet::readActivityFile ( string aFileName )

reads the activity of molecules from an activity file. expects a file containing: "label [tab] class"; "molname [tab] 1" for active molecules; "molname [tab] -1" for inactive molecules. molname is the file name of the Mol file.

void MoleculeSet::readDescriptorFile ( string aFileName,

string separator = ";"

)

reads a descriptor file. the first line in the file indicates the Descriptor comment text. the second line in the file indicates the Descriptor units. the third line in the file indicates the Descriptor types. the fourth line indicates the Descriptor names. data start at the fifth line.
the first column should contain the same molecule names as those obtained by getName() for the molecules in the dataset.
example:
Full name;Log octanol partition coefficient;boiling point;special comment
NA;NA;K;NA
string;float;float;string
name;logP;Bp;acomment
ethane;10;200;junk data
WARNING: does not check for duplicate name entries in the descriptor file. WARNING: does not add anything to the molecules not contained in the descriptor file.

void MoleculeSet::readGistActivityFile ( string aFileName,

string aDescriptor

)

reads the content of a gist activity file and adds the information to descriptor aDescriptor for each molecule matching molecule names.

void MoleculeSet::readGistClassifyFile ( string aFileName )

reads the content of a file produced by gist-classify and adds the information matching molecule names in the set as descriptors gistname, gistclass, and gistdiscr.

void MoleculeSet::readGram ( string aFileName,

vector< vector< double > > * gram

)

reads a gram matrix matching the dataset. emits an error if the dimension of the gram matrix read does not match the number of compounds in the dataset.

void MoleculeSet::readGramNormal ( string aFileName )

reads a normalized gram matrix matching the dataset.

void MoleculeSet::readGramRaw ( string aFileName )

reads a raw gram matrix matching the dataset.

void MoleculeSet::readKcfDirectory ( string dataDir,

long beginMolecule = -1,

long endMolecule = -1

)

loads all .kcf files in a directory and adds the molecules to the set.

void MoleculeSet::readMolDirectory ( string aPath,

bool genericAtomType = false,

long beginMolecule = -1,

long endMolecule = -1

)

loads all .mol files in a directory and adds the molecules to the set.

void MoleculeSet::readPartialCharges ( string fileName )

reads the partial charges associated to the molecule set from an input file. NOTE : the input file has one line per molecule of the molecule set, and within each line, the values of the partial charges are separated by ';'.

void MoleculeSet::removeDuplicates ( )

removes duplicate entries in the set.

void MoleculeSet::resetGramMatrix ( )

erases the Gram Matrix and sets the gramCalculated flag to false.

void MoleculeSet::resetSelfKernels ( )

resets all calculated selfkernels for the molecules in the dataset.

void MoleculeSet::restoreHiddenAtoms ( )

restores the hidden atoms for all molecules.

int MoleculeSet::select ( vector< string > * aSubset )

selects the molecules with the names provided as arguments in a vector of string. unselect all others. returns the number of selected molecules.

void MoleculeSet::selectAll ( )

selects all molecules in the dataset. WARNING the selection status is stored in the Molecule class. Therefore molecules in other datasets will be selected too...

long MoleculeSet::selectByActivity ( float aValue )

selects all molecules having activity equal to aValue.

long MoleculeSet::selectByFloatDescriptor ( string aName,

float aValue

)

selects all molecules having float descriptor aName with value aValue, unselects all others.

long MoleculeSet::selectByIntDescriptor ( string aName,

int aValue

)

selects all molecules having int descriptor aName with value aValue, unselects all others.

int MoleculeSet::selectByMW ( float minmw,

float maxmw = -1,

bool addMolecularDescriptor = false

)

selects all molecules in the dataset with mw >= minmw and <= maxmw. if maxmw = -1 then there is no maximum limit. if addMolecularDescriptor is true then a floatDescriptor with label mw is added to the molecule.

int MoleculeSet::selectByNumAtoms ( float minNumAtoms,

float maxNumAtoms = -1,

bool addMolecularDescriptor = false

)

selects all molecules in the dataset with number of atoms >= minNumAtoms and <= maxNumAtoms. if maxNumAtoms = -1 then there is no maximum limit. if addMolecularDescriptor is true then an intDescriptor with label numAtoms is added to the molecule.

long MoleculeSet::selectHasActivity ( )

selects all molecules which have a defined activity status.

void MoleculeSet::setActivity ( string aMolecule,

float aValue

)

sets the activity of aMolecule to aValue.

void MoleculeSet::setComparisonSet ( MoleculeSet * )

sets the comparison set of the moleculeSet.

void MoleculeSet::setIntDescriptor ( string aLabel,

string aMolecule,

int aValue

)

sets the value of intDescriptor aLabel of molecule aMolecule to aValue.

void MoleculeSet::setIntDescriptor ( string aName,

int aValue

)

sets integer aName to aValue for all compounds in the dataset.

void MoleculeSet::setKashimaKernelParam ( double aPq,

int aConvergenceCondition,

bool skipSkeleton = false

)

sets the start, stop (aPq) and transition probabilities. sets the start, stop (aPq) and transition probabilities for all molecules according to the article by Kashima et al. using the setKashimaKernelProb(aFloat) function of the Molecule class. WARNING: a call to this function erases the gram Matrix and it will be recalculated.

void MoleculeSet::setMorganChargesLabels ( double threshold )

sets the 'morgan charges' labels of the atoms, i.e., the concatenation of the Morgan labels of the atoms and the (+/-) sign of their partial charges.

void MoleculeSet::setMorganLabels ( int anOrder )

sets the morganLabels of each molecule to the anOrder iteration of the Morgan index calculation process.

void MoleculeSet::setSortDescriptor ( string aName,

int aType,

bool reverse = false

) throw ( CError ) [protected]

sets the type and name of descriptor to be used when sorting molecules. if reverse == true the sorting will be in descending order.

void MoleculeSet::setUniqueMorganIndices ( )

sets the uniqueMorganIndex of each atom to the Morgan index having the maximum of different connectivity values for the molecule.

void MoleculeSet::sortByDescriptor ( string aDescriptorName,

bool reverse = false

)

sorts the molecule collection according to the molecule Descriptor aDescriptorName. the descriptor type is read from the descriptor name if the name is of the form *.integer or *.float, and is set to string otherwise. if reverse = true then the sorting is in decreasing order.

void MoleculeSet::sortByDescriptor ( string aDescriptorName,

int aDescriptorType,

bool reverse = false

)

sorts the molecule collection according to the molecule Descriptor aDescriptorName of type aDescriptorType. if reverse = true then the sorting is in decreasing order.

void MoleculeSet::sortByMW ( )

sorts all compounds in the set by Molecular weight.

void MoleculeSet::sortByNumAtoms ( )

sorts all compounds in the set by their number of atoms.

void MoleculeSet::substractToGram ( int row,

int col,

double value

)

subtracts a value from a Gram matrix entry.

void MoleculeSet::threeDtransform ( int nBins,

double distMin,

double distMax

)

transforms the molecular graphs into '3D complete graphs' with edges labeled by inter atomic distances in order to compute the pharmacophore kernel (see (Mahe et al., 2006)).

string MoleculeSet::toString ( bool selectedOnly = false )

returns a string description of the MoleculeSet.

string MoleculeSet::toStringLong ( )

returns a long string description of the MoleculeSet.

string MoleculeSet::toStringShort ( )

returns a short string description of the MoleculeSet (number of molecules in the set and short description of each molecule).

int MoleculeSet::unSelect ( vector< string > * aSubset )

unselects the molecules with the names provided as arguments in a vector of string. returns the number of selected molecules.

void MoleculeSet::unSelectAll ( )

unselects all molecules in the dataset. WARNING the selection status is stored in the Molecule class. Therefore molecules in other datasets will be unselected too...

void MoleculeSet::writeActivityFile ( string aFilename,

bool addActivityExtension = true,

string activityDescriptor = ACTIVITY

)

writes a file with the biological activity of molecules in a format compatible with GIST.

void MoleculeSet::writeDescriptors ( string aFileName,

bool selectedOnly = false

) throw ( CError )

writes a ';' separated file containing all descriptors for all molecules.

long MoleculeSet::writeDotsToDir ( string aDirectory,

bool selectedOnly = false,

bool perretLabels = false

)

writes a dot file for each molecule in the set to aDirectory.

void MoleculeSet::writeGramMatrix ( string aFileName,

bool normal = false,

bool self = false,

bool silentMode = false

)

writes the gram matrix.
Examples:
moleculeset_example.cpp.

void MoleculeSet::writeKCF ( string aFileName,

bool selectedOnly = false

)

writes a KCF file with the whole moleculeSet.

long MoleculeSet::writeMolToDir ( string aDirName,

bool selectedOnly = false

)

writes a mol file for each molecule in the set to aDirName. if selectedOnly == true then only selected molecules are written.

void MoleculeSet::writeSD ( string aFileName,

bool selectedOnly = false

)

writes an MDL structure data (SD) file with the molecules in the moleculeSet. setting selectedOnly to true outputs only the selected compounds. a pointer to a vector of strings containing the molecule names for ordered output can be specified.

void MoleculeSet::writeSelfKernelList ( string aFilename,

bool silentMode = false

)

writes all self kernel values in a file.

void MoleculeSet::writeSubsetKCF ( string aFileName,

vector< string > * anOrder

)

writes a KCF file with the molecules in the moleculeSet matching the names given as argument.

void MoleculeSet::writeSubsetSD ( string aFileName,

vector< string > * anOrder

)

writes a MDL structure data (SD) file with the molecules in the moleculeSet matching the names given as argument.

Member Data Documentation

bool MoleculeSet::activitySet [protected]

stores if the activity of the molecules in the set was specified.

MoleculeSet* MoleculeSet::comparisonSet [protected]

comparison set of the molecule set. In test set mode, the comparison set is another set of compounds. In self set mode, the comparison set is the molecule set itself.

int MoleculeSet::convergenceCondition [protected]

convergence condition used in the calculation of the Kashima Kernel.

vector< vector<double> >* MoleculeSet::gram [protected]

Gram matrix.

bool MoleculeSet::gramCalculated [protected]

contains true if the gram matrix was evaluated for the current MoleculeSet, false otherwise. note that setKashimaKernelProb sets this flag to false. {Addition of a new molecule to the set when the flag is set to true induces the calculation of a new line to the gram matrix DEPRECATED}

vector< vector<double> >* MoleculeSet::gramNormal [protected]

normal Gram matrix.

double MoleculeSet::pq [protected]

kashima Stop probability for the moleculeSet used to set kashimaProb in added molecules.

int MoleculeSet::subsetSize [protected]

subset size.

int MoleculeSet::subsetStart [protected]

subset start.

The documentation for this class was generated from the following file:

src/moleculeset.h

Generated on Wed Nov 28 12:12:52 2007 for ChemCpp by doxygen 1.4.6


Public Member Functions
MoleculeSet construction functions
	MoleculeSet ()
	MoleculeSet (const MoleculeSet &aSet)
	~MoleculeSet ()
int	add (MoleculeSet *aSet)
void	addMolecule (Molecule *aMolecule)
Molecule *	addMoleculeCopy (Molecule *aMolecule)
void	deleteAll ()
Input functions
int	addSD (string aFileName, bool genericAtomType=false, long beginMolecule=-1, long endMolecule=-1)
int	addKCF (string aFileName, long beginMolecule=-1, long endMolecule=-1)
Molecule *	addSingleMOL (string aMolFile, bool genericAtomType=false)
Molecule *	addSingleKCF (string aMolFile)
void	readMolDirectory (string aPath, bool genericAtomType=false, long beginMolecule=-1, long endMolecule=-1)
void	readKcfDirectory (string dataDir, long beginMolecule=-1, long endMolecule=-1)
void	addMutag (string aFileName, string rFileName="", uint numMolToRead=500)
void	readActivityFile (string aFileName)
void	readDescriptorFile (string aFileName, string separator=";")
void	readGistClassifyFile (string aFileName)
void	readGistActivityFile (string aFileName, string aDescriptor)
void	readGram (string aFileName, vector< vector< double > > *gram)
void	readGramNormal (string aFileName)
void	readGramRaw (string aFileName)
void	readPartialCharges (string fileName)
Accessor functions
uint	numMolecules ()
Molecule *	operator[] (string aName) throw ( CError )
Molecule *	getMolByName (string aName) throw ( CError )
Molecule *	operator[] (int anInd) throw ( CError )
Molecule *	getMolByIndex (int anInd) throw ( CError )
long	getPossibleValuesInIntDescriptor (string aDescriptorName, vector< int > *p)
double	getPq ()
int	getConvergenceCondition ()
bool	nameExists (string aName)
bool	hasActivity ()
void	setIntDescriptor (string aName, int aValue)
void	setUniqueMorganIndices ()
void	setMorganLabels (int anOrder)
void	setIntDescriptor (string aLabel, string aMolecule, int aValue)
void	setActivity (string aMolecule, float aValue)
void	setKashimaKernelParam (double aPq, int aConvergenceCondition, bool skipSkeleton=false)
void	setComparisonSet (MoleculeSet *)
void	setMorganChargesLabels (double threshold)
MoleculeSet manipulation functions
void	selectAll ()
void	unSelectAll ()
int	select (vector< string > *aSubset)
int	unSelect (vector< string > *aSubset)
long	selectByFloatDescriptor (string aName, float aValue)
long	selectByIntDescriptor (string aName, int aValue)
long	selectByActivity (float aValue)
long	selectHasActivity ()
int	selectByMW (float minmw, float maxmw=-1, bool addMolecularDescriptor=false)
int	selectByNumAtoms (float minNumAtoms, float maxNumAtoms=-1, bool addMolecularDescriptor=false)
void	sortByDescriptor (string aDescriptorName, int aDescriptorType, bool reverse=false)
void	sortByDescriptor (string aDescriptorName, bool reverse=false)
void	sortByMW ()
void	sortByNumAtoms ()
void	binClassifyFromDescriptor (string descriptorName, float value, bool smallerOrEqual=true)
Molecule *	findFirstMoleculeWithName (string aName) throw ( CError )
void	removeDuplicates ()
void	deleteHiddenAtoms ()
void	hideHydrogens ()
void	hideSalts (string aReportFileName="")
void	restoreHiddenAtoms ()
void	addFragmentsToSet (Molecule *aMol, int minAtoms=1)
void	pushFragments (Molecule *aMol, int minAtoms=1)
double	diversityBaryMean ()
vector< string >	atomsLabelsListing ()
vector< string >	atomsSymbolsListing ()
vector< int >	bondsListing ()
void	noTottersTransform ()
void	threeDtransform (int nBins, double distMin, double distMax)
void	minMaxDistances (double distMin, double distMax)
Kernels and Gram matrices related functions
void	gramCompute (double aPq, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int aParameter=1000, string aReportFileName="", int nbThreadsWanted=1, bool silentMode=false, bool filterTotters=false)
void	gramCompute (MoleculeSet *anotherSet, double aPq, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int aParameter=1000, string aReportFileName="", int nbThreadsWanted=1, bool silentMode=false, bool filterTotters=false)
void	gramCompute (double aPq, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int parameter1=1000, int parameter2=1, string aReportFileName="", int nbThreadsWanted=1, bool silentMode=false, bool filterTotters=false)
void	gramCompute (MoleculeSet *anotherSet, double aPq, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int parameter1=1000, int parameter2=1, string aReportFileName="", int nbThreadsWanted=1, bool silentMode=false, bool filterTotters=false)
void	gramCompute3D (double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float edgeKernelparameter, bool silentMode)
void	gramCompute3D (MoleculeSet *anotherSet, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(float, float, float), float edgeKernelparameter, bool silentMode)
void	kernelCompute (Molecule *aMol, double(*pt2GraphKernel)(Molecule *mol1, Molecule *mol2, double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), int, int), double(*pt2AtomKernel)(Atom *, Atom *), double(*pt2BondKernel)(Bond *, Bond *), vector< double > *resultsRaw, vector< double > *resultsNormal, int convergenceCondition=1000, int parameter2=1, bool silentMode=false)
void	resetGramMatrix ()
void	resetSelfKernels ()
void	initializeGram (double value)
void	initializeSelfKernel (double value)
void	normalizeGram ()
void	normalizeGram_raw ()
void	normalizeTanimoto ()
void	normalizeTanimoto_raw ()
void	normalizeTanimotoMinMax ()
void	addToGram (int row, int col, double value)
void	addToGramNormal (int row, int col, double value)
void	substractToGram (int row, int col, double value)
double	getGramValue (int row, int col)
Output functions
void	writeActivityFile (string aFilename, bool addActivityExtension=true, string activityDescriptor=ACTIVITY)
void	writeGramMatrix (string aFileName, bool normal=false, bool self=false, bool silentMode=false)
void	writeSelfKernelList (string aFilename, bool silentMode=false)
void	writeSD (string aFileName, bool selectedOnly=false)
void	writeSubsetSD (string aFileName, vector< string > *anOrder)
void	writeSubsetKCF (string aFileName, vector< string > *anOrder)
void	writeKCF (string aFileName, bool selectedOnly=false)
string	toString (bool selectedOnly=false)
string	toStringShort ()
string	toStringLong ()
void	describe (bool selectedOnly=false)
void	describeShort ()
void	describeLong ()
void	writeDescriptors (string aFileName, bool selectedOnly=false) throw ( CError )
long	writeMolToDir (string aDirName, bool selectedOnly=false)
long	writeDotsToDir (string aDirectory, bool selectedOnly=false, bool perretLabels=false)
Protected Member Functions
void	setSortDescriptor (string aName, int aType, bool reverse=false) throw ( CError )
string	getSortDescriptorName ()
Protected Attributes
MoleculeSet *	comparisonSet
vector< vector< double > > *	gram
vector< vector< double > > *	gramNormal
bool	gramCalculated
double	pq
int	convergenceCondition
int	subsetStart
int	subsetSize
bool	activitySet