moleculeset.h

00001 /****************************************************************************************
00002                                           moleculeset.h 
00003                                         -----------------
00004     copyright            : (C) 2006 Jean-Luc Perret - Pierre Mahé
00005     email                : jean-luc.perret@unine.ch - pierre.mahe@ensmp.fr
00006  ***************************************************************************************/
00007 
00008 /****************************************************************************************
00009  *                                                                                      *
00010  *      This program is free software; you can redistribute it and/or                   *
00011  *      modify it under the terms of the GNU Lesser General Public                      *
00012  *      License as published by the Free Software Foundation; either                    *
00013  *      version 2.1 of the License, or (at your option) any later version.              *
00014  *                                                                                      *
00015  *      This program is distributed in the hope that it will be useful,                 *
00016  *      but WITHOUT ANY WARRANTY; without even the implied warranty of                  *
00017  *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU               *
00018  *      Lesser General Public License for more details.                                 *
00019  *                                                                                      *
00020  *      You should have received a copy of the GNU Lesser General Public                *
00021  *      License along with this library; if not, write to the Free Software             *
00022  *      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA      *
00023  *                                                                                      *
00024  ****************************************************************************************/
00025 
00026 
00027 #ifndef MOLECULESET_H
00028 #define MOLECULESET_H
00029 
00030 #include <vector>
00031 #include <fstream>
00032 #include <sstream>
00033 #include <pthread.h>
00034 #include <math.h>
00035 
00036 #include <datacontainer.h>
00037 #include <molecule.h>
00038 //#include <moleculeutils.h>
00039 #include <kcfmolecule.h>
00040 #include <jlpioutils.h>
00041 #include <constant.h>
00042 
00043 #define VERBOSECALC 1
00044 
00045 
00067 class MoleculeSet : public std::vector<Molecule*> {
              // A std::vector of Molecule pointers that additionally declares the
              // add*/read*, select*/sortBy*, gram*/normalize* and write*/describe*
              // members below. Only declarations are visible in this listing; the
              // out-of-line definitions live elsewhere.
              //
              // NOTE(review): the original Doxygen comments are absent from this
              // listing, so the comments here only state what the signatures show.
              // NOTE(review): std::vector has no virtual destructor, so a MoleculeSet
              // must not be deleted through a std::vector<Molecule*> pointer.
              // NOTE(review): the operator[] overloads declared below hide
              // std::vector<Molecule*>::operator[] inside this class.
              //
              // Standard-library types are written with an explicit std:: prefix, as
              // in the base-class clause, so these declarations do not depend on a
              // using-directive leaking in from another header.
00068
00073 public:
00074
00076
00077
              // Constructors and destructor.
              // NOTE(review): a copy constructor and a destructor are declared but no
              // copy-assignment operator, so the implicitly generated assignment
              // copies the raw pointer members (comparisonSet, gram, gramNormal)
              // member-wise - confirm that this is intended.
00080         MoleculeSet();
00081         MoleculeSet(const MoleculeSet& aSet);
00082
00086         ~MoleculeSet();
00087
              // Members that add another set or single molecules.
              // NOTE(review): the meaning of the int / Molecule* return values is not
              // visible here - check the definitions.
00090         int add( MoleculeSet* aSet );
00091
00097         void addMolecule( Molecule* aMolecule );
00098
00101         Molecule* addMoleculeCopy( Molecule* aMolecule );
00102
00105         void deleteAll();
00106
00107
00109
00110
00112
00113
              // add*/read* members taking file or directory names.
              // Where present, genericAtomType defaults to false and beginMolecule /
              // endMolecule default to -1 (presumably "no limit" - TODO confirm in
              // the definitions).
00122         int addSD( std::string aFileName, bool genericAtomType = false, long beginMolecule = -1, long endMolecule = -1 );
00123
00126         int addKCF( std::string aFileName, long beginMolecule = -1, long endMolecule = -1  );
00127
00130         Molecule* addSingleMOL( std::string aMolFile, bool genericAtomType = false );
00131
00134         Molecule* addSingleKCF( std::string aMolFile );
00135
00138         void readMolDirectory( std::string aPath, bool genericAtomType = false, long beginMolecule = -1, long endMolecule = -1 );
00139
00142         void readKcfDirectory( std::string dataDir, long beginMolecule = -1, long endMolecule = -1 );
00143
              // rFileName defaults to the empty string and numMolToRead to 500.
00150         void addMutag( std::string aFileName, std::string rFileName="", uint numMolToRead = 500 );
00151
00152
00160         void readActivityFile( std::string aFileName );
00161
              // separator defaults to ";".
00184         void readDescriptorFile( std::string aFileName, std::string separator = ";" );
00185
00189         void readGistClassifyFile( std::string aFileName );
00190
00194         void readGistActivityFile( std::string aFileName, std::string aDescriptor );
00195
              // Takes the destination matrix as a pointer argument; readGramNormal and
              // readGramRaw take only a file name.
00200         void readGram( std::string aFileName, std::vector< std::vector<double> >* gram );
00201
00204         void readGramNormal( std::string aFileName );
00205
00208         void readGramRaw( std::string aFileName );
00209
00214         void readPartialCharges(std::string fileName);
00215
00216
00218
00219
00220
00221
00222
00224
00225
              // Accessors and setters.

              // Returns size() of the underlying vector, converted to uint.
              // const so that it can be called on a const MoleculeSet.
00228         uint numMolecules() const { return( size() ); }
00229
              // NOTE(review): throw( CError ) is a dynamic exception specification,
              // deprecated in C++11 and removed in C++17. It is kept on every member
              // that has one because it must match the out-of-line definitions,
              // which are not visible here.
00235         Molecule* operator[]( std::string aName ) throw( CError );
00236
00242         Molecule* getMolByName( std::string aName ) throw( CError );
00243
00246         Molecule* operator[]( int anInd ) throw( CError );
00247
00250         Molecule* getMolByIndex( int anInd ) throw( CError );
00251
00252
              // Receives an output vector through the pointer p.
00256         long getPossibleValuesInIntDescriptor( std::string aDescriptorName, std::vector< int >* p );
00257
              // Returns the pq member.
00261         double getPq() const { return( pq ); }
00262
00263
              // Returns the convergenceCondition member.
00267         int getConvergenceCondition() const { return( convergenceCondition ); }
00268
00269
00272         bool nameExists( std::string aName );
00273
              // Returns the activitySet member.
00277         bool hasActivity() const { return activitySet; }
00278
              // Two setIntDescriptor overloads exist: ( name, value ) here and
              // ( label, molecule, value ) further down.
00281         void setIntDescriptor( std::string aName, int aValue );
00282
00286         void setUniqueMorganIndices();
00287
00291         void setMorganLabels( int anOrder );
00292
00295         void setIntDescriptor( std::string aLabel, std::string aMolecule, int aValue );
00296
00299         void setActivity( std::string aMolecule, float aValue );
00300
              // skipSkeleton defaults to false.
00308         void setKashimaKernelParam( double aPq, int aConvergenceCondition, bool skipSkeleton = false );
00309
00312         void setComparisonSet( MoleculeSet* );
00313
00317         void setMorganChargesLabels(double threshold);
00318
00319
00321
00322
00323
00325
00326
              // select*/unSelect*/sortBy* members and other whole-set operations.
              // NOTE(review): the meaning of the int / long return values is not
              // visible here - check the definitions.
00331         void selectAll();
00332
00337         void unSelectAll();
00338
00342         int select( std::vector< std::string >* aSubset );
00343
00347         int unSelect( std::vector< std::string >* aSubset );
00348
00351         long selectByFloatDescriptor( std::string aName, float aValue );
00352
00355         long selectByIntDescriptor( std::string aName, int aValue );
00356
00359         long selectByActivity( float aValue );
00360
00363         long selectHasActivity();
00364
              // maxmw defaults to -1 and addMolecularDescriptor to false.
00370         int selectByMW( float minmw, float maxmw = -1 , bool addMolecularDescriptor = false );
00371
              // maxNumAtoms defaults to -1 and addMolecularDescriptor to false.
00377         int selectByNumAtoms( float minNumAtoms, float maxNumAtoms = -1 , bool addMolecularDescriptor = false );
00378
              // Two sortByDescriptor overloads: with and without an explicit
              // aDescriptorType. reverse defaults to false in both.
00383         void sortByDescriptor( std::string aDescriptorName, int aDescriptorType, bool reverse = false );
00384
00390         void sortByDescriptor( std::string aDescriptorName, bool reverse = false );
00391
00394         void sortByMW();
00395
00398         void sortByNumAtoms();
00399
              // smallerOrEqual defaults to true.
00405         void binClassifyFromDescriptor( std::string descriptorName, float value, bool smallerOrEqual = true );
00406
00407
00410         Molecule* findFirstMoleculeWithName( std::string aName ) throw( CError );
00411
00412
00415         void removeDuplicates();
00416
00419         void deleteHiddenAtoms();
00420
00423         void hideHydrogens();
00424
              // aReportFileName defaults to the empty string.
00427         void hideSalts( std::string aReportFileName = "" );
00428
00431         void restoreHiddenAtoms();
00432
              // minAtoms defaults to 1 in both fragment members.
00435         void addFragmentsToSet( Molecule* aMol, int minAtoms = 1 );
00436
00439         void pushFragments( Molecule* aMol, int minAtoms = 1 );
00440
00443         double diversityBaryMean();
00444
00447         std::vector<std::string> atomsLabelsListing();
00448
00451         std::vector<std::string> atomsSymbolsListing();
00452
00455         std::vector<int> bondsListing();
00456
00459         void noTottersTransform();
00460
00464         void threeDtransform(int nBins, double distMin, double distMax);
00465
              // Results are written through the two pointer arguments.
00468         void minMaxDistances(double *distMin, double* distMax);
00469
00470
00472
00473
00474
00476
00477
              // gramCompute overloads. All four take aPq, a graph-kernel function
              // pointer, an atom-kernel function pointer and a bond-kernel function
              // pointer; two of them additionally take anotherSet as first argument.
              //
              // NOTE(review): within each pair (with / without anotherSet) the two
              // overloads have identical leading parameters and differ only from the
              // std::string-vs-int parameter onwards (aReportFileName vs parameter2).
              // A call that stops before that parameter is therefore ambiguous, so
              // the default values of aParameter / parameter1 cannot actually be
              // relied upon by callers.

              // Single int parameter (aParameter), no second set.
00482         void gramCompute(
00483                 double aPq,
00484                 double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2,
00485                                          double(*pt2AtomKernel)(Atom*, Atom*),
00486                                          double(*pt2BondKernel)(Bond*, Bond*), int, int ),
00487                 double(*pt2AtomKernel)(Atom*, Atom*),
00488                 double(*pt2BondKernel)(Bond*, Bond*),
00489                 int aParameter = 1000,
00490                 std::string aReportFileName = "",
00491                 int nbThreadsWanted = 1,
00492                 bool silentMode = false,
00493                 bool filterTotters = false);
00494
              // Single int parameter (aParameter), with anotherSet.
00499         void gramCompute(
00500                 MoleculeSet* anotherSet,
00501                 double aPq,
00502                 double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2,
00503                 double(*pt2AtomKernel)(Atom*, Atom*),
00504                 double(*pt2BondKernel)(Bond*, Bond*), int, int ),
00505                 double(*pt2AtomKernel)(Atom*, Atom*),
00506                 double(*pt2BondKernel)(Bond*, Bond*),
00507                 int aParameter = 1000,
00508                 std::string aReportFileName = "",
00509                 int nbThreadsWanted = 1,
00510                 bool silentMode = false,
00511                 bool filterTotters = false);
00512
              // Two int parameters (parameter1, parameter2), no second set.
00517         void gramCompute(
00518                 double aPq,
00519                 double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2,
00520                 double(*pt2AtomKernel)(Atom*, Atom*),
00521                 double(*pt2BondKernel)(Bond*, Bond*), int, int ),
00522                 double(*pt2AtomKernel)(Atom*, Atom*),
00523                 double(*pt2BondKernel)(Bond*, Bond*),
00524                 int parameter1 = 1000,
00525                 int parameter2 = 1,
00526                 std::string aReportFileName = "",
00527                 int nbThreadsWanted = 1,
00528                 bool silentMode = false,
00529                 bool filterTotters = false);
00530
              // Two int parameters (parameter1, parameter2), with anotherSet.
00533         void gramCompute(
00534                 MoleculeSet* anotherSet,
00535                 double aPq,
00536                 double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2,
00537                                          double(*pt2AtomKernel)(Atom*, Atom*),
00538                                          double(*pt2BondKernel)(Bond*, Bond*), int, int ),
00539                 double(*pt2AtomKernel)(Atom*, Atom*),
00540                 double(*pt2BondKernel)(Bond*, Bond*),
00541                 int parameter1 = 1000,
00542                 int parameter2 = 1,
00543                 std::string aReportFileName = "",
00544                 int nbThreadsWanted = 1,
00545                 bool silentMode = false,
00546                 bool filterTotters = false);
00547
              // gramCompute3D overloads: here the bond kernel takes three floats
              // instead of two Bond pointers, and the graph kernel takes a trailing
              // float. Neither overload has default arguments.
00550         void gramCompute3D(
00551                          double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2,
00552                                                   double(*pt2AtomKernel)(Atom*, Atom*),
00553                                                   double(*pt2BondKernel)(float,float,float), float),
00554                          double(*pt2AtomKernel)(Atom*, Atom*),
00555                          double(*pt2BondKernel)(float, float, float),
00556                          float edgeKernelparameter,
00557                          bool silentMode );
00558
00561         void gramCompute3D(
00562                            MoleculeSet* anotherSet,
00563                            double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2,
00564                                                     double(*pt2AtomKernel)(Atom*, Atom*),
00565                                                     double(*pt2BondKernel)(float,float,float), float),
00566                            double(*pt2AtomKernel)(Atom*, Atom*),
00567                            double(*pt2BondKernel)(float, float, float),
00568                            float edgeKernelparameter,
00569                            bool silentMode );
00570
              // Takes a single molecule plus two output vectors (resultsRaw,
              // resultsNormal) passed by pointer.
00573         void kernelCompute(
00574                 Molecule* aMol,
00575                 double(*pt2GraphKernel)( Molecule* mol1, Molecule* mol2, double(*pt2AtomKernel)(Atom*, Atom*), double(*pt2BondKernel)(Bond*, Bond*), int, int ),
00576                 double(*pt2AtomKernel)(Atom*, Atom*),
00577                 double(*pt2BondKernel)(Bond*, Bond*),
00578                 std::vector<double>* resultsRaw,
00579                 std::vector<double>* resultsNormal,
00580                 int convergenceCondition = 1000,
00581                 int parameter2 = 1,
00582                 bool silentMode = false
00583         );
00584
00585
              // Gram-matrix / self-kernel maintenance members.
00588         void resetGramMatrix();
00589
00592         void resetSelfKernels();
00593
00596         void initializeGram( double value );
00597
00600         void initializeSelfKernel( double value );
00601
00604         void normalizeGram();
00605
00609         void normalizeGram_raw();
00610
00613         void normalizeTanimoto();
00614
00617         void normalizeTanimoto_raw();
00620         void normalizeTanimotoMinMax();
00621
00624         void addToGram( int row, int col, double value );
00625
00628         void addToGramNormal( int row, int col, double value );
00629
              // The spelling "substract" is part of the public name and is kept.
00632         void substractToGram( int row, int col, double value );
00633
00636         double getGramValue( int row, int col );
00637
00638
00640
00641
00642
00643
00644
00645
00646
00648
00649
              // write*/toString*/describe* members.
              // activityDescriptor defaults to ACTIVITY, which is not defined in this
              // header (presumably it comes from constant.h - TODO confirm).
00651         void writeActivityFile( std::string aFilename, bool addActivityExtension = true, std::string activityDescriptor = ACTIVITY );
00652
00655         void writeGramMatrix( std::string aFileName, bool normal = false, bool self = false, bool silentMode = false );
00656
00659         void writeSelfKernelList( std::string aFilename, bool silentMode = false );
00660
              // selectedOnly defaults to false wherever it appears below.
00666         void writeSD( std::string aFileName, bool selectedOnly = false );
00667
00671         void writeSubsetSD( std::string aFileName, std::vector<std::string>* anOrder );
00672
00676         void writeSubsetKCF( std::string aFileName, std::vector<std::string>* anOrder );
00677
00680         void writeKCF( std::string aFileName, bool selectedOnly = false );
00681
00684         std::string toString( bool selectedOnly = false );
00688         std::string toStringShort();
00689
00692         std::string toStringLong();
00693
00694
00697         void describe( bool selectedOnly = false );
00698
00701         void describeShort();
00702
00705         void describeLong();
00706
00707
00710         void writeDescriptors( std::string aFileName, bool selectedOnly = false ) throw( CError ) ;
00711
00715         long writeMolToDir( std::string aDirName, bool selectedOnly = false );
00716
00719         long writeDotsToDir( std::string aDirectory, bool selectedOnly = false, bool perretLabels = false  );
00720
00721
00723
00724
00725
00726 // DEPRECATED FUNCTIONS : 
00727 //void addMolecule( Molecule* aMolecule, bool updateGram = true );
00728
00731 //void pop_back();
00732 //void normalizeGram_self();
00733 //void normalizeGram_test();
00734 //void setSelfGram();
00735
00736
00737
00738 protected: // Protected attributes
00739
              // Raw pointer members. Ownership is not visible in this header - check
              // the constructor / destructor definitions.
00744         MoleculeSet* comparisonSet;
00745
00748         std::vector< std::vector<double> >* gram;
00749
00752         std::vector< std::vector<double> >* gramNormal;
00753
00759         bool gramCalculated;
00760
              // Value returned by getPq().
00764         double pq;
00765
              // Value returned by getConvergenceCondition().
00768         int convergenceCondition;
00769
00772         int subsetStart;
00773
00776         int subsetSize;
00777
              // Value returned by hasActivity().
00780         bool activitySet;
00781
00782
              // Protected helpers; reverse defaults to false.
00786         void setSortDescriptor( std::string aName, int aType, bool reverse = false ) throw( CError );
00787
00790         std::string getSortDescriptorName();
00791
00792
00795         //vector<Molecule*> molecules;
00796
00797
00798 };
00799 
00800 #endif

Generated on Wed Nov 28 12:12:51 2007 for ChemCpp by  doxygen 1.4.6