NCBI C++ ToolKit: src/objtools/blast/seqdb_writer/writedb

00001 #ifndef OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP
00002 #define OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP
00003 
00004 /*  $Id: writedb_files.hpp 134303 2008-07-17 17:42:49Z camacho $
00005  * ===========================================================================
00006  *
00007  *                            PUBLIC DOMAIN NOTICE
00008  *               National Center for Biotechnology Information
00009  *
00010  *  This software/database is a "United States Government Work" under the
00011  *  terms of the United States Copyright Act.  It was written as part of
00012  *  the author's official duties as a United States Government employee and
00013  *  thus cannot be copyrighted.  This software/database is freely available
00014  *  to the public for use. The National Library of Medicine and the U.S.
00015  *  Government have not placed any restriction on its use or reproduction.
00016  *
00017  *  Although all reasonable efforts have been taken to ensure the accuracy
00018  *  and reliability of the software and data, the NLM and the U.S.
00019  *  Government do not and cannot warrant the performance or results that
00020  *  may be obtained by using this software or data. The NLM and the U.S.
00021  *  Government disclaim all warranties, express or implied, including
00022  *  warranties of performance, merchantability or fitness for any particular
00023  *  purpose.
00024  *
00025  *  Please cite the author in any work or product based on this material.
00026  *
00027  * ===========================================================================
00028  *
00029  * Author:  Kevin Bealer
00030  *
00031  */
00032 
00033 /// @file writedb_files.hpp
00034 /// Code for database files construction.
00035 ///
00036 /// Defines classes:
00037 ///     CWriteDB_File, CWriteDB_IndexFile, CWriteDB_HeaderFile, CWriteDB_SequenceFile
00038 ///
00039 /// Implemented for: UNIX, MS-Windows
00040 
00041 #include "writedb_general.hpp"
00042 #include "writedb_convert.hpp"
00043 #include <objects/seq/seq__.hpp>
00044 #include <corelib/ncbistre.hpp>
00045 #include <corelib/ncbifile.hpp>
00046 
00047 BEGIN_NCBI_SCOPE
00048 
00049 /// Import definitions from the objects namespace.
00050 USING_SCOPE(objects);
00051 
00052 /// CWriteDB_IndexFile class
00053 /// 
00054 /// This manufactures blast database index files from input data.
00055 
00056 class CWriteDB_File : public CObject {
00057 public:
00058     // Setup and control
00059     
00060     /// Constructor.
00061     ///
00062     /// The filename is constructed from basename, extension, and
00063     /// index, but might be changed if the RenameSingle() method is
00064     /// called.  If zero is specified for maximum file size, a default
00065     /// size is provided by this class.  The maximum file size is not
00066     /// enforced by this class, instead each derived class must do its
00067     /// own enforcement.
00068     ///
00069     /// @param basename Database base name, shared by all files. [in]
00070     /// @param extension File name extension for this file. [in]
00071     /// @param index Volume index used in filename. [in]
00072     /// @param max_file_size File size limit (in bytes). [in]
00073     /// @param always_create If true the file will be created now. [in]
00074     CWriteDB_File(const string & basename,
00075                   const string & extension,
00076                   int            index,
00077                   Uint8          max_file_size,
00078                   bool           always_create);
00079     
00080     /// Create and open the file.
00081     ///
00082     /// This method must be called before the first time that data is
00083     /// written to the file.  If the constructor is passed 'true' for
00084     /// always_create, this method will be called during construction.
00085     /// It is an error to call this method more than once (including
00086     /// via the constructor) or to not call it but to call Write.  The
00087     /// rationale for making this explicit is to permit some files to
00088     /// be created optionally, such as ISAM files, which should only
00089     /// be created if the corresponding ID types are found.
00090     void Create();
00091     
00092     /// Write contents of a string to the file.
00093     /// @param data Data to write.
00094     /// @return File offset after write.
00095     int Write(const CTempString & data);
00096     
00097     /// Write an Int4 (in bigendian order) to the file.
00098     /// @param data String to write.
00099     /// @return File offset after write.
00100     int WriteInt4(int data)
00101     {
00102         s_WriteInt4(m_RealFile, data);
00103         m_Offset += 4;
00104         return m_Offset;
00105     }
00106     
00107     /// Write an Int8 (in bigendian order) to the file.
00108     /// @param data String to write.
00109     /// @return File offset after write.
00110     int WriteInt8(Int8 data)
00111     {
00112         s_WriteInt8BE(m_RealFile, data);
00113         m_Offset += 8;
00114         return m_Offset;
00115     }
00116     
00117     /// Write contents of a string to the file, appending a NUL.
00118     /// @param data String to write.
00119     /// @return File offset after write.
00120     int WriteWithNull(const CTempString & data)
00121     {
00122         Write(data);
00123         return Write(m_Nul);
00124     }
00125     
00126     /// Close the file, flushing any remaining data to disk.
00127     void Close();
00128     
00129     /// Rename this file, disincluding the volume index.
00130     virtual void RenameSingle();
00131     
00132     /// Construct the short name for a volume.
00133     ///
00134     /// Volume names consist of the database base name, ".", and the
00135     /// volume index in decimal.  The volume index is normally two
00136     /// digits, but if more than 100 volumes are needed, the filename
00137     /// will use three or more index digits as needed.
00138     ///
00139     /// @param base Base name to use.
00140     /// @param index Volume index.
00141     /// @return A short name.
00142     static string MakeShortName(const string & base, int index);
00143     
00144     /// Get the current filename for this file.
00145     ///
00146     /// The filename is returned.  The data returned by this method
00147     /// reflects changes made by RenameSingle(), so it is probably
00148     /// best to call it after that method has been called (if it will
00149     /// be called).
00150     ///
00151     /// @return The filename.
00152     const string & GetFilename() const
00153     {
00154         return m_Fname;
00155     }
00156     
00157 protected:
00158     /// True if the file has already been opened.
00159     bool m_Created;
00160     
00161     /// Underlying 'output file' type used here.
00162     typedef ofstream TFile;
00163     
00164     /// For convenience, a string containing one NUL character.
00165     string m_Nul; // init me
00166     
00167     /// The default value for max_file_size.
00168     /// @return The max file size used if otherwise unspecified.
00169     Uint8 x_DefaultByteLimit()
00170     {
00171         // 1 gb (marketing version) - 1; about a billion
00172         return 1000*1000*1000 - 1;
00173     }
00174     
00175     /// This should flush any unwritten data to disk.
00176     ///
00177     /// This method must be implemented by derived classes to flush
00178     /// any unwritten data to disk.  In the cases of sequence and
00179     /// header files, it will normally do nothing, because such files
00180     /// are written as the data is available.  For index (pin/nin) and
00181     /// ISAM files, this method does most of the disk I/O.
00182     virtual void x_Flush() = 0;
00183     
00184     /// Build the filename for this file.
00185     void x_MakeFileName();
00186     
00187     // Configuration
00188     
00189     string m_BaseName;    ///< Database base name for all files.
00190     string m_Extension;   ///< File extension for this file.
00191     int    m_Index;       ///< Volume index.
00192     int    m_Offset;      ///< Stream position.
00193     Uint8  m_MaxFileSize; ///< Maximum file size in bytes.
00194     
00195     // The file
00196     
00197     bool   m_UseIndex; ///< True if filenames should use volume index.
00198     string m_Fname;    ///< Current filename for output file.
00199     TFile  m_RealFile; ///< Actual stream implementing the output file.
00200 };
00201 
00202 // For index file format, see .cpp file.
00203 
00204 /// This class builds the volume index file (pin or nin).
00205 class CWriteDB_IndexFile : public CWriteDB_File {
00206 public:
00207     /// Constructor.
00208     /// @param dbname Database base name.
00209     /// @param protein True for protein volumes.
00210     /// @param title Database title string.
00211     /// @param date Timestamp of database construction start.
00212     /// @param index Index of this volume.
00213     /// @param max_file_size Maximum file size in bytes (or zero).
00214     CWriteDB_IndexFile(const string & dbname,
00215                        bool           protein,
00216                        const string & title,
00217                        const string & date,
00218                        int            index,
00219                        Uint8          max_file_size);
00220     
00221     /// Returns true if another sequence can fit into the file.
00222     bool CanFit()
00223     {
00224         _ASSERT(m_MaxFileSize > 1024);
00225         
00226         if (! m_OIDs)
00227             return true;
00228         
00229         return m_DataSize < (m_MaxFileSize-12);
00230     }
00231     
00232     /// Add a sequence to a protein index file (pin).
00233     ///
00234     /// The index file does not need sequence data, so this method
00235     /// only needs offsets of the data in other files.
00236     ///
00237     /// @param Sequence length in letters.
00238     /// @param hdr Length of binary ASN.1 header data.
00239     /// @param seq Length in bytes of sequence data.
00240     void AddSequence(int length, int hdr, int seq)
00241     {
00242         if (length > m_MaxLength) {
00243             m_MaxLength = length;
00244         }
00245         
00246         m_OIDs ++;
00247         m_Letters += length;
00248         m_DataSize += 8;
00249         
00250         m_Hdr.push_back(hdr);
00251         m_Seq.push_back(seq);
00252     }
00253     
00254     /// Add a sequence to a nucleotide index file (nin).
00255     ///
00256     /// The index file does not need sequence data, so this method
00257     /// only needs offsets of the data in other files.
00258     ///
00259     /// @param Sequence length in letters.
00260     /// @param hdr Length of binary ASN.1 header data.
00261     /// @param seq Length in bytes of packed sequence data.
00262     /// @param seq Length in bytes of packed ambiguity data.
00263     void AddSequence(int length, int hdr, int seq, int amb)
00264     {
00265         if (length > m_MaxLength) {
00266             m_MaxLength = length;
00267         }
00268         
00269         m_OIDs ++;
00270         m_Letters += length;
00271         
00272         m_DataSize += 12;
00273         m_Hdr.push_back(hdr);
00274         m_Seq.push_back(amb); // Not a bug.
00275         m_Amb.push_back(seq); // Also not a bug.
00276     }
00277     
00278 private:
00279     /// Compute index file overhead.  This is the overhead used by all
00280     /// fields of the index file, and does account for padding.
00281     ///
00282     /// @param T Title string.
00283     /// @param D Create time string.
00284     /// @return Combined size of all meta-data fields in nin/pin file.
00285     int x_Overhead(const string & T, const string & D);
00286     
00287     /// Flush index data to disk.
00288     virtual void x_Flush();
00289     
00290     bool   m_Protein;   ///< True if this is a protein database.
00291     string m_Title;     ///< Title string for all database volumes.
00292     string m_Date;      ///< Database creation time stamp.
00293     int    m_OIDs;      ///< OIDs added to database so far.
00294     int    m_Overhead;  ///< Amount of file used by metadata.
00295     Uint8  m_DataSize;  ///< Required space for data once written to disk.
00296     Uint8  m_Letters;   ///< Letters of sequence data accumulated so far.
00297     int    m_MaxLength; ///< Length of longest sequence.
00298     
00299     // Because the lengths are found via "next offset - this offset",
00300     // each array has an extra element.  (This is not necesary in the
00301     // case of m_Amb; the last element is never examined because of
00302     // the alternation of sequences and ambiguities.)
00303     
00304     /// Start offset in header file of each OID's headers.
00305     ///
00306     /// The end offset is given by the start offset of the following
00307     /// OID's headers.
00308     vector<int> m_Hdr;
00309     
00310     /// Offset in sequence file of each OID's sequence data.
00311     ///
00312     /// The end of the sequence data is given by the start offset of
00313     /// the ambiguity data for the same OID.
00314     vector<int> m_Seq;
00315     
00316     /// Offset in sequence file of each OID's ambiguity data.
00317     ///
00318     /// The end of the ambiguity data is given by the start offset of
00319     /// the sequence data for the next OID.
00320     vector<int> m_Amb;
00321 };
00322 
00323 /// This class builds the volume header file (phr or nhr).
00324 class CWriteDB_HeaderFile : public CWriteDB_File {
00325 public:
00326     /// Constructor.
00327     /// @param dbname Database base name.
00328     /// @param protein True for protein volumes.
00329     /// @param index Index of this volume.
00330     /// @param max_file_size Maximum file size in bytes (or zero).
00331     CWriteDB_HeaderFile(const string & dbname,
00332                         bool           protein,
00333                         int            index,
00334                         Uint8          max_file_size);
00335     
00336     /// Returns true if the specified amount of data would fit.
00337     ///
00338     /// If the specified amount of data (in bytes) would fit in the
00339     /// file without exceeding the max_file_size, this method returns
00340     /// true.
00341     ///
00342     /// @param size Size of new data in bytes.
00343     bool CanFit(int size)
00344     {
00345         if (! m_DataSize) {
00346             return true;
00347         }
00348         
00349         return (m_DataSize + size) < m_MaxFileSize;
00350     }
00351     
00352     /// Add binary header data to this file.
00353     /// @param binhdr Binary ASN.1 version of header data. [in]
00354     /// @param offset Offset of end of header data. [out]
00355     void AddSequence(const string & binhdr, int & offset)
00356     {
00357         m_DataSize = offset = Write(binhdr);
00358     }
00359     
00360 private:
00361     /// Flush unwritten data to the output file.
00362     virtual void x_Flush()
00363     {
00364         // There is nothing to do here - header data is written as
00365         // soon as it is added.
00366     }
00367     
00368     /// Amount of data written so far.
00369     Uint8 m_DataSize;
00370 };
00371 
00372 class CWriteDB_SequenceFile : public CWriteDB_File {
00373 public:
00374     /// Constructor.
00375     /// @param dbname Database base name.
00376     /// @param protein True for protein volumes.
00377     /// @param index Index of this volume.
00378     /// @param max_file_size Maximum file size in bytes (or zero).
00379     /// @param max_letter Maximum sequence letters per volume (or zero).
00380     CWriteDB_SequenceFile(const string & dbname,
00381                           bool           protein,
00382                           int            index,
00383                           Uint8          max_file_size,
00384                           Uint8          max_letters);
00385     
00386     /// Returns true if the specified amount of data would fit.
00387     ///
00388     /// If the specified amount of data (in bytes) would fit in the
00389     /// file without exceeding the max_file_size, and the specified
00390     /// number of letters would fit without exceeding the maximum
00391     /// letters limit, this method returns true.
00392     ///
00393     /// @param size Size of new data in bytes.
00394     /// @param letters Number of sequence letters in new data.
00395     bool CanFit(int size, int letters)
00396     {
00397         if (m_Offset <= 1) {
00398             return true;
00399         }
00400         
00401         if (m_BaseLimit &&
00402             ((m_Letters + letters) > m_BaseLimit)) {
00403             return false;
00404         }
00405         
00406         return ((m_Offset + (unsigned)size)  < m_MaxFileSize);
00407     }
00408     
00409     /// Add a protein sequence to this file.
00410     ///
00411     /// This method should only be called in the protein case.
00412     ///
00413     /// @param sequence Packed sequence data. [in]
00414     /// @param offset Offset of the end of the sequence data. [out]
00415     /// @param length Length of the sequence in letters. [in]
00416     void AddSequence(const string & sequence,
00417                      int          & offset,
00418                      int            length)
00419     {
00420         _ASSERT(m_Protein);
00421         offset = WriteWithNull(sequence);
00422         m_Letters += length;
00423     }
00424     
00425     /// Add a nucleotide sequence to this file.
00426     ///
00427     /// This method should only be called in the nucleotide case.
00428     ///
00429     /// @param sequence Packed sequence data. [in]
00430     /// @param ambig Packed ambiguity data. [in]
00431     /// @param off_seq Offset of the end of the sequence data. [out]
00432     /// @param off_amb Offset of the end of the ambiguity data. [out]
00433     /// @param length Length of the sequence in letters. [in]
00434     void AddSequence(const string & sequence,
00435                      const string & ambig,
00436                      int          & off_seq,
00437                      int          & off_amb,
00438                      int            length)
00439     {
00440         _ASSERT(! m_Protein);
00441         off_seq = Write(sequence);
00442         off_amb = Write(ambig);
00443         m_Letters += length;
00444     }
00445     
00446 private:
00447     /// Flush unwritten data to the output file.
00448     virtual void x_Flush()
00449     {
00450         // There is nothing to do here - sequence data is written as
00451         // soon as it is added.
00452     }
00453     
00454     Uint8 m_Letters;   ///< Letters of sequence data added so far.
00455     Uint8 m_BaseLimit; ///< Limit on letters of sequence data.
00456     bool  m_Protein;   ///< True if this is a protein database.
00457 };
00458 
00459 END_NCBI_SCOPE
00460 
00461 
00462 #endif // OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP
00463 
00464 
00465
src/objtools/blast/seqdb_writer/writedb_files.hpp