00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #ifndef MOLECULESET_H
00028 #define MOLECULESET_H
00029
00030 #include <vector>
00031 #include <fstream>
00032 #include <sstream>
00033 #include <pthread.h>
00034 #include <math.h>
00035
00036 #include <datacontainer.h>
00037 #include <molecule.h>
00038
00039 #include <kcfmolecule.h>
00040 #include <jlpioutils.h>
00041 #include <constant.h>
00042
/// Preprocessor flag, defined to 1. It is not referenced in the visible part of this header.
/// Presumably it enables verbose progress output in the calculation code -- TODO confirm
/// against the implementation files.
00043 #define VERBOSECALC 1
00044
00045
/// MoleculeSet: a collection of Molecule pointers, publicly derived from
/// std::vector<Molecule*>, so the whole vector interface is available on it.
///
/// On top of the vector interface the class declares file readers, accessors,
/// selection / sorting helpers, Gram matrix (kernel) entry points and writers.
/// Only four accessors (numMolecules, getPq, getConvergenceCondition,
/// hasActivity) are defined inline; every other member is declared here and
/// defined elsewhere, so remarks marked "presumably" are inferred from names
/// and signatures and must be confirmed against the implementation.
///
/// NOTE(review): every line starts with a five-digit listing number, which is
///   not valid C++. This looks like residue of a generated source listing --
///   confirm and strip the numbers before compiling.
/// NOTE(review): std::vector has no virtual destructor, so deleting a
///   MoleculeSet through a std::vector<Molecule*> pointer is undefined behaviour.
/// NOTE(review): the throw( CError ) dynamic exception specifications are
///   deprecated since C++11 and ill-formed since C++17. They must be changed
///   together with the matching definitions.
/// NOTE(review): string, vector and uint are used unqualified and without a
///   visible declaration; presumably a project header included above provides
///   them (for example through a using-directive) -- confirm.
00067 class MoleculeSet : public std::vector<Molecule*> {
00068
00073 public:
00074
// ---------------------------------------------------------------------------
// Construction, destruction and basic content management
// ---------------------------------------------------------------------------
00076
00077
/// Default constructor.
00080 MoleculeSet();
/// Copy constructor taking the set to copy from.
/// NOTE(review): whether the pointed-to Molecule objects are duplicated or only
/// the pointers are copied is decided in the implementation -- confirm.
/// No copy-assignment operator is declared in this class.
00081 MoleculeSet(const MoleculeSet& aSet);
00082
/// Destructor (not declared virtual).
00086 ~MoleculeSet();
00087
/// Presumably adds the molecules of aSet to this set. The meaning of the int
/// return value is not visible here -- TODO confirm.
00090 int add( MoleculeSet* aSet );
00091
/// Presumably appends the given Molecule pointer to the set -- confirm who owns
/// the object afterwards.
00097 void addMolecule( Molecule* aMolecule );
00098
/// Presumably appends a copy of aMolecule and returns a pointer to that copy
/// -- TODO confirm.
00101 Molecule* addMoleculeCopy( Molecule* aMolecule );
00102
/// Presumably deletes the Molecule objects held by the set -- confirm whether
/// the vector itself is also emptied.
00105 void deleteAll();
00106
00107
// ---------------------------------------------------------------------------
// Readers (molecule files, activities, descriptors, Gram matrices)
// ---------------------------------------------------------------------------
00109
00110
00112
00113
/// Presumably reads molecules from the SD file aFileName.
/// Defaults: genericAtomType = false, beginMolecule = -1, endMolecule = -1
/// (-1 presumably means no range restriction -- confirm).
/// The meaning of the int return value is not visible here.
00122 int addSD( string aFileName, bool genericAtomType = false, long beginMolecule = -1, long endMolecule = -1 );
00123
/// Presumably reads molecules from the KCF file aFileName.
/// Defaults: beginMolecule = -1, endMolecule = -1. Return value meaning not
/// visible here.
00126 int addKCF( string aFileName, long beginMolecule = -1, long endMolecule = -1 );
00127
/// Presumably reads one molecule from a MOL file and returns a pointer to it.
/// Default: genericAtomType = false.
00130 Molecule* addSingleMOL( string aMolFile, bool genericAtomType = false );
00131
/// Presumably reads one molecule from a KCF file and returns a pointer to it.
00134 Molecule* addSingleKCF( string aMolFile );
00135
/// Presumably reads the MOL files found under aPath.
/// Defaults: genericAtomType = false, beginMolecule = -1, endMolecule = -1.
00138 void readMolDirectory( string aPath, bool genericAtomType = false, long beginMolecule = -1, long endMolecule = -1 );
00139
/// Presumably reads the KCF files found under dataDir.
/// Defaults: beginMolecule = -1, endMolecule = -1.
00142 void readKcfDirectory( string dataDir, long beginMolecule = -1, long endMolecule = -1 );
00143
/// Presumably reads a Mutag data file (aFileName) with an optional second file
/// (rFileName, default empty string) and reads at most numMolToRead molecules
/// (default 500) -- TODO confirm the role of rFileName.
00150 void addMutag( string aFileName, string rFileName="", uint numMolToRead = 500 );
00151
00152
/// Presumably reads activity values for the molecules from aFileName; the file
/// format is defined by the implementation.
00160 void readActivityFile( string aFileName );
00161
/// Presumably reads molecule descriptors from aFileName.
/// The field separator defaults to ";".
00184 void readDescriptorFile( string aFileName, string separator = ";" );
00185
/// Presumably reads a Gist classification result file -- TODO confirm what is
/// stored on the molecules.
00189 void readGistClassifyFile( string aFileName );
00190
/// Presumably reads a Gist activity file and stores the values under the
/// descriptor named aDescriptor -- TODO confirm.
00194 void readGistActivityFile( string aFileName, string aDescriptor );
00195
/// Presumably reads a Gram matrix from aFileName into the matrix passed as the
/// second argument.
/// NOTE(review): the parameter name gram shadows the data member gram.
00200 void readGram( string aFileName, vector< vector<double> >* gram );
00201
/// Presumably reads a normalised Gram matrix from aFileName (likely into
/// gramNormal -- confirm).
00204 void readGramNormal( string aFileName );
00205
/// Presumably reads a raw (non-normalised) Gram matrix from aFileName (likely
/// into gram -- confirm).
00208 void readGramRaw( string aFileName );
00209
/// Presumably reads atomic partial charges from fileName -- TODO confirm format.
00214 void readPartialCharges(string fileName);
00215
00216
// ---------------------------------------------------------------------------
// Accessors
// ---------------------------------------------------------------------------
00218
00219
00220
00221
00222
00224
00225
/// Returns size() of the underlying vector, converted to uint.
/// NOTE(review): not declared const, so it cannot be called on a const set.
00228 uint numMolecules(){ return( size() ); }
00229
/// Lookup by a string argument (presumably the molecule name); declared to
/// throw CError.
/// NOTE(review): declaring operator[] here hides the inherited
/// std::vector operator[]( size_type ).
00235 Molecule* operator[]( string aName ) throw( CError );
00236
/// Named counterpart of operator[]( string ); declared to throw CError.
/// Presumably returns the molecule whose name equals aName -- confirm.
00242 Molecule* getMolByName( string aName ) throw( CError );
00243
/// Lookup by int index; declared to throw CError (presumably on an
/// out-of-range index -- confirm).
00246 Molecule* operator[]( int anInd ) throw( CError );
00247
/// Named counterpart of operator[]( int ); declared to throw CError.
00250 Molecule* getMolByIndex( int anInd ) throw( CError );
00251
00252
/// Presumably collects into *p the distinct values taken by the integer
/// descriptor aDescriptorName; meaning of the long return value not visible
/// here -- TODO confirm.
00256 long getPossibleValuesInIntDescriptor( string aDescriptorName, vector< int >* p );
00257
/// Returns the data member pq.
00261 double getPq(){ return( pq ); }
00262
00263
/// Returns the data member convergenceCondition.
00267 int getConvergenceCondition(){ return( convergenceCondition ); }
00268
00269
/// Presumably reports whether a molecule named aName is present in the set.
00272 bool nameExists( string aName );
00273
/// Returns the data member activitySet.
00277 bool hasActivity(){ return activitySet; }
00278
/// Two-argument overload (descriptor name, value). Presumably sets the integer
/// descriptor aName to aValue on every molecule -- confirm.
00281 void setIntDescriptor( string aName, int aValue );
00282
/// Presumably assigns unique Morgan indices to the atoms of the molecules
/// -- TODO confirm.
00286 void setUniqueMorganIndices();
00287
/// Presumably sets Morgan labels of order anOrder on the molecules -- TODO
/// confirm.
00291 void setMorganLabels( int anOrder );
00292
/// Three-argument overload (label, molecule, value). Presumably sets the
/// integer descriptor aLabel to aValue on the molecule identified by aMolecule
/// -- confirm.
00295 void setIntDescriptor( string aLabel, string aMolecule, int aValue );
00296
/// Presumably sets the activity of the molecule identified by aMolecule to
/// aValue -- confirm.
00299 void setActivity( string aMolecule, float aValue );
00300
/// Takes a pq value, a convergence condition and a skipSkeleton flag (default
/// false). Presumably stores the first two in pq and convergenceCondition and
/// forwards the parameters to the molecules -- TODO confirm.
00308 void setKashimaKernelParam( double aPq, int aConvergenceCondition, bool skipSkeleton = false );
00309
/// Presumably stores the given set in the data member comparisonSet -- confirm.
00312 void setComparisonSet( MoleculeSet* );
00313
/// Presumably sets Morgan labels that take partial charges into account, using
/// threshold -- TODO confirm its meaning.
00317 void setMorganChargesLabels(double threshold);
00318
00319
// ---------------------------------------------------------------------------
// Selection, sorting and structural transformations
// ---------------------------------------------------------------------------
00321
00322
00323
00325
00326
/// Presumably marks every molecule as selected.
00331 void selectAll();
00332
/// Presumably marks every molecule as not selected.
00337 void unSelectAll();
00338
/// Presumably selects the molecules named in *aSubset; meaning of the int
/// return value not visible here.
00342 int select( vector< string >* aSubset );
00343
/// Presumably unselects the molecules named in *aSubset; meaning of the int
/// return value not visible here.
00347 int unSelect( vector< string >* aSubset );
00348
/// Presumably selects molecules whose float descriptor aName matches aValue;
/// the long return value is presumably a count -- confirm.
00351 long selectByFloatDescriptor( string aName, float aValue );
00352
/// Presumably selects molecules whose integer descriptor aName matches aValue;
/// the long return value is presumably a count -- confirm.
00355 long selectByIntDescriptor( string aName, int aValue );
00356
/// Presumably selects molecules whose activity matches aValue -- confirm.
00359 long selectByActivity( float aValue );
00360
/// Presumably selects molecules that have an activity value -- confirm.
00363 long selectHasActivity();
00364
/// Presumably selects molecules by molecular weight between minmw and maxmw.
/// Defaults: maxmw = -1 (presumably no upper bound -- confirm),
/// addMolecularDescriptor = false.
00370 int selectByMW( float minmw, float maxmw = -1 , bool addMolecularDescriptor = false );
00371
/// Presumably selects molecules by atom count between minNumAtoms and
/// maxNumAtoms. Defaults: maxNumAtoms = -1, addMolecularDescriptor = false.
/// NOTE(review): the bounds are declared as float although they count atoms.
00377 int selectByNumAtoms( float minNumAtoms, float maxNumAtoms = -1 , bool addMolecularDescriptor = false );
00378
/// Overload with an explicit descriptor type code (aDescriptorType; the valid
/// codes are not visible here). reverse defaults to false.
00383 void sortByDescriptor( string aDescriptorName, int aDescriptorType, bool reverse = false );
00384
/// Overload without a type code; presumably the descriptor type is determined
/// by the implementation -- confirm. reverse defaults to false.
00390 void sortByDescriptor( string aDescriptorName, bool reverse = false );
00391
/// Presumably sorts the molecules by molecular weight.
00394 void sortByMW();
00395
/// Presumably sorts the molecules by number of atoms.
00398 void sortByNumAtoms();
00399
/// Presumably derives a binary class for each molecule by comparing the
/// descriptor descriptorName with value; smallerOrEqual (default true)
/// presumably selects the direction of the comparison -- TODO confirm.
00405 void binClassifyFromDescriptor( string descriptorName, float value, bool smallerOrEqual = true );
00406
00407
/// Presumably returns the first molecule named aName; declared to throw CError.
00410 Molecule* findFirstMoleculeWithName( string aName ) throw( CError );
00411
00412
/// Presumably removes duplicated molecules -- the criterion is defined by the
/// implementation.
00415 void removeDuplicates();
00416
/// Presumably deletes atoms currently flagged as hidden in every molecule.
00419 void deleteHiddenAtoms();
00420
/// Presumably flags hydrogen atoms as hidden in every molecule.
00423 void hideHydrogens();
00424
/// Presumably hides salt fragments; aReportFileName defaults to the empty
/// string (presumably meaning no report file -- confirm).
00427 void hideSalts( string aReportFileName = "" );
00428
/// Presumably makes previously hidden atoms visible again.
00431 void restoreHiddenAtoms();
00432
/// Presumably adds the fragments of aMol with at least minAtoms atoms
/// (default 1) to the set -- TODO confirm how it differs from pushFragments.
00435 void addFragmentsToSet( Molecule* aMol, int minAtoms = 1 );
00436
/// Presumably pushes the fragments of aMol with at least minAtoms atoms
/// (default 1) -- TODO confirm how it differs from addFragmentsToSet.
00439 void pushFragments( Molecule* aMol, int minAtoms = 1 );
00440
/// Returns a double; presumably a diversity measure of the set -- the exact
/// definition is not visible here.
00443 double diversityBaryMean();
00444
/// Returns a vector of strings, presumably the atom labels occurring in the set.
00447 vector<string> atomsLabelsListing();
00448
/// Returns a vector of strings, presumably the atom symbols occurring in the set.
00451 vector<string> atomsSymbolsListing();
00452
/// Returns a vector of ints, presumably the bond types occurring in the set.
00455 vector<int> bondsListing();
00456
/// Presumably applies a transformation related to tottering walks to the
/// molecules -- TODO confirm.
00459 void noTottersTransform();
00460
/// Takes a bin count and a distance interval; presumably discretises
/// inter-atomic distances into nBins bins between distMin and distMax -- TODO
/// confirm.
00464 void threeDtransform(int nBins, double distMin, double distMax);
00465
/// Output parameters: presumably writes the minimum and maximum distances
/// through distMin and distMax -- confirm.
00468 void minMaxDistances(double *distMin, double* distMax);
00469
00470
// ---------------------------------------------------------------------------
// Kernel / Gram matrix computation
//
// The gramCompute overloads share these arguments: aPq, a graph kernel
// function pointer (which itself receives two molecules, an atom kernel, a
// bond kernel and two ints), the atom kernel and the bond kernel to use,
// followed by optional tuning arguments.
//
// NOTE(review): the one-int-parameter overloads (aParameter) and the
// two-int-parameter overloads (parameter1, parameter2) are both viable when
// the optional arguments are omitted or only the first int is given, so such
// calls are ambiguous. Callers have to pass at least one argument past the
// first int to disambiguate.
// ---------------------------------------------------------------------------
00472
00473
00474
00476
00477
/// Gram computation taking no other set. Optional arguments: aParameter = 1000,
/// aReportFileName = "", nbThreadsWanted = 1, silentMode = false,
/// filterTotters = false.
00482 void gramCompute(
00483 double aPq,
00484 double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2,
00485 double(*pt2AtomKernel)(Atom*, Atom*),
00486 double(*pt2BondKernel)(Bond*, Bond*), int, int ),
00487 double(*pt2AtomKernel)(Atom*, Atom*),
00488 double(*pt2BondKernel)(Bond*, Bond*),
00489 int aParameter = 1000,
00490 string aReportFileName = "",
00491 int nbThreadsWanted = 1,
00492 bool silentMode = false,
00493 bool filterTotters = false);
00494
/// Same argument list as above with a leading anotherSet; presumably computes
/// the kernel between this set and anotherSet -- confirm.
00499 void gramCompute(
00500 MoleculeSet* anotherSet,
00501 double aPq,
00502 double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2,
00503 double(*pt2AtomKernel)(Atom*, Atom*),
00504 double(*pt2BondKernel)(Bond*, Bond*), int, int ),
00505 double(*pt2AtomKernel)(Atom*, Atom*),
00506 double(*pt2BondKernel)(Bond*, Bond*),
00507 int aParameter = 1000,
00508 string aReportFileName = "",
00509 int nbThreadsWanted = 1,
00510 bool silentMode = false,
00511 bool filterTotters = false);
00512
/// Variant taking two int parameters (parameter1 = 1000, parameter2 = 1)
/// instead of the single aParameter; the remaining optional arguments have the
/// same defaults as above.
00517 void gramCompute(
00518 double aPq,
00519 double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2,
00520 double(*pt2AtomKernel)(Atom*, Atom*),
00521 double(*pt2BondKernel)(Bond*, Bond*), int, int ),
00522 double(*pt2AtomKernel)(Atom*, Atom*),
00523 double(*pt2BondKernel)(Bond*, Bond*),
00524 int parameter1 = 1000,
00525 int parameter2 = 1,
00526 string aReportFileName = "",
00527 int nbThreadsWanted = 1,
00528 bool silentMode = false,
00529 bool filterTotters = false);
00530
/// Two-int-parameter variant with a leading anotherSet.
00533 void gramCompute(
00534 MoleculeSet* anotherSet,
00535 double aPq,
00536 double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2,
00537 double(*pt2AtomKernel)(Atom*, Atom*),
00538 double(*pt2BondKernel)(Bond*, Bond*), int, int ),
00539 double(*pt2AtomKernel)(Atom*, Atom*),
00540 double(*pt2BondKernel)(Bond*, Bond*),
00541 int parameter1 = 1000,
00542 int parameter2 = 1,
00543 string aReportFileName = "",
00544 int nbThreadsWanted = 1,
00545 bool silentMode = false,
00546 bool filterTotters = false);
00547
/// 3D variant: here the bond (edge) kernel takes three floats and the graph
/// kernel receives one float instead of two ints. edgeKernelparameter and
/// silentMode have no default value.
00550 void gramCompute3D(
00551 double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2,
00552 double(*pt2AtomKernel)(Atom*, Atom*),
00553 double(*pt2BondKernel)(float,float,float), float),
00554 double(*pt2AtomKernel)(Atom*, Atom*),
00555 double(*pt2BondKernel)(float, float, float),
00556 float edgeKernelparameter,
00557 bool silentMode );
00558
/// 3D variant with a leading anotherSet.
00561 void gramCompute3D(
00562 MoleculeSet* anotherSet,
00563 double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2,
00564 double(*pt2AtomKernel)(Atom*, Atom*),
00565 double(*pt2BondKernel)(float,float,float), float),
00566 double(*pt2AtomKernel)(Atom*, Atom*),
00567 double(*pt2BondKernel)(float, float, float),
00568 float edgeKernelparameter,
00569 bool silentMode );
00570
/// Presumably evaluates the kernel between aMol and each molecule of the set,
/// writing the results through resultsRaw and resultsNormal -- TODO confirm.
/// Defaults: convergenceCondition = 1000, parameter2 = 1, silentMode = false.
/// NOTE(review): the parameter convergenceCondition shadows the data member of
/// the same name.
00573 void kernelCompute(
00574 Molecule* aMol,
00575 double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2, double(*pt2AtomKernel)(Atom*, Atom*), double(*pt2BondKernel)(Bond*, Bond*), int, int ),
00576 double(*pt2AtomKernel)(Atom*, Atom*),
00577 double(*pt2BondKernel)(Bond*, Bond*),
00578 vector<double>* resultsRaw,
00579 vector<double>* resultsNormal,
00580 int convergenceCondition = 1000,
00581 int parameter2 = 1,
00582 bool silentMode = false
00583 );
00584
00585
/// Presumably clears or re-creates the Gram matrices -- confirm.
00588 void resetGramMatrix();
00589
/// Presumably resets the self-kernel values stored on the molecules -- confirm.
00592 void resetSelfKernels();
00593
/// Presumably fills the Gram matrix with value.
00596 void initializeGram( double value );
00597
/// Presumably sets every self-kernel to value.
00600 void initializeSelfKernel( double value );
00601
/// Presumably normalises the Gram matrix (likely producing gramNormal)
/// -- the formula is defined by the implementation.
00604 void normalizeGram();
00605
/// Presumably a variant of normalizeGram working on the raw matrix -- TODO
/// confirm the difference.
00609 void normalizeGram_raw();
00610
/// Presumably applies a Tanimoto normalisation to the Gram matrix -- confirm.
00613 void normalizeTanimoto();
00614
/// Presumably a variant of normalizeTanimoto working on the raw matrix -- TODO
/// confirm the difference.
00617 void normalizeTanimoto_raw();
/// Presumably a min/max flavour of the Tanimoto normalisation -- TODO confirm.
00620 void normalizeTanimotoMinMax();
00621
/// Presumably adds value to the Gram matrix entry (row, col).
00624 void addToGram( int row, int col, double value );
00625
/// Presumably adds value to the normalised Gram matrix entry (row, col).
00628 void addToGramNormal( int row, int col, double value );
00629
/// Presumably subtracts value from the Gram matrix entry (row, col).
00632 void substractToGram( int row, int col, double value );
00633
/// Presumably returns the Gram matrix entry (row, col) -- confirm which of the
/// two matrices is read.
00636 double getGramValue( int row, int col );
00637
00638
// ---------------------------------------------------------------------------
// Writers and textual descriptions
// ---------------------------------------------------------------------------
00640
00641
00642
00643
00644
00645
00646
00648
00649
/// Presumably writes the activities of the molecules to aFilename.
/// Defaults: addActivityExtension = true, activityDescriptor = ACTIVITY
/// (ACTIVITY is not defined in this header; presumably it comes from one of
/// the included project headers -- confirm).
00651 void writeActivityFile( string aFilename, bool addActivityExtension = true, string activityDescriptor = ACTIVITY );
00652
/// Presumably writes a Gram matrix to aFileName.
/// Defaults: normal = false, self = false, silentMode = false (normal
/// presumably selects the normalised matrix -- confirm the role of self).
00655 void writeGramMatrix( string aFileName, bool normal = false, bool self = false, bool silentMode = false );
00656
/// Presumably writes the self-kernel values to aFilename. silentMode defaults
/// to false.
00659 void writeSelfKernelList( string aFilename, bool silentMode = false );
00660
/// Presumably writes the molecules to an SD file; selectedOnly defaults to
/// false.
00666 void writeSD( string aFileName, bool selectedOnly = false );
00667
/// Presumably writes to an SD file the molecules named in *anOrder, in that
/// order -- confirm.
00671 void writeSubsetSD( string aFileName, vector<string>* anOrder );
00672
/// Presumably writes to a KCF file the molecules named in *anOrder, in that
/// order -- confirm.
00676 void writeSubsetKCF( string aFileName, vector<string>* anOrder );
00677
/// Presumably writes the molecules to a KCF file; selectedOnly defaults to
/// false.
00680 void writeKCF( string aFileName, bool selectedOnly = false );
00681
/// Returns a string, presumably a textual description of the set;
/// selectedOnly defaults to false.
00684 string toString( bool selectedOnly = false );
/// Returns a string, presumably a short description of the set.
00688 string toStringShort();
00689
/// Returns a string, presumably a long description of the set.
00692 string toStringLong();
00693
00694
/// Presumably prints the description produced by toString -- confirm the
/// output stream. selectedOnly defaults to false.
00697 void describe( bool selectedOnly = false );
00698
/// Presumably prints the short description.
00701 void describeShort();
00702
/// Presumably prints the long description.
00705 void describeLong();
00706
00707
/// Presumably writes the descriptors of the molecules to aFileName; declared
/// to throw CError. selectedOnly defaults to false.
00710 void writeDescriptors( string aFileName, bool selectedOnly = false ) throw( CError ) ;
00711
/// Presumably writes one MOL file per molecule into aDirName; the long return
/// value is presumably a count -- confirm. selectedOnly defaults to false.
00715 long writeMolToDir( string aDirName, bool selectedOnly = false );
00716
/// Presumably writes one dot (graph description) file per molecule into
/// aDirectory. Defaults: selectedOnly = false, perretLabels = false
/// -- TODO confirm the meaning of perretLabels.
00719 long writeDotsToDir( string aDirectory, bool selectedOnly = false, bool perretLabels = false );
00720
00721
00723
00724
00725
00726
00727
00728
00731
00732
00733
00734
00735
00736
00737
// ---------------------------------------------------------------------------
// State shared with derived classes
// ---------------------------------------------------------------------------
00738 protected:
00739
/// Pointer to another MoleculeSet; presumably the set assigned through
/// setComparisonSet -- confirm. Ownership is not visible here.
00744 MoleculeSet* comparisonSet;
00745
/// Pointer to a matrix of doubles; presumably the raw Gram matrix. Allocation
/// and release are handled by the implementation.
00748 vector< vector<double> >* gram;
00749
/// Pointer to a matrix of doubles; presumably the normalised Gram matrix.
00752 vector< vector<double> >* gramNormal;
00753
/// Flag; presumably true once a Gram matrix has been computed -- confirm.
00759 bool gramCalculated;
00760
/// Value returned by getPq().
00764 double pq;
00765
/// Value returned by getConvergenceCondition().
00768 int convergenceCondition;
00769
/// Presumably the first index of a subset of molecules being processed -- TODO
/// confirm usage.
00772 int subsetStart;
00773
/// Presumably the number of molecules in that subset -- TODO confirm usage.
00776 int subsetSize;
00777
/// Value returned by hasActivity().
00780 bool activitySet;
00781
00782
/// Presumably records the descriptor name and type used by the sorting
/// routines; declared to throw CError. reverse defaults to false.
00786 void setSortDescriptor( string aName, int aType, bool reverse = false ) throw( CError );
00787
/// Returns a string, presumably the descriptor name set by setSortDescriptor.
00790 string getSortDescriptorName();
00791
00792
00795
00796
00797
00798 };
00799
00800 #endif