00001 #ifndef OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP 00002 #define OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP 00003 00004 /* $Id: writedb_files.hpp 134303 2008-07-17 17:42:49Z camacho $ 00005 * =========================================================================== 00006 * 00007 * PUBLIC DOMAIN NOTICE 00008 * National Center for Biotechnology Information 00009 * 00010 * This software/database is a "United States Government Work" under the 00011 * terms of the United States Copyright Act. It was written as part of 00012 * the author's official duties as a United States Government employee and 00013 * thus cannot be copyrighted. This software/database is freely available 00014 * to the public for use. The National Library of Medicine and the U.S. 00015 * Government have not placed any restriction on its use or reproduction. 00016 * 00017 * Although all reasonable efforts have been taken to ensure the accuracy 00018 * and reliability of the software and data, the NLM and the U.S. 00019 * Government do not and cannot warrant the performance or results that 00020 * may be obtained by using this software or data. The NLM and the U.S. 00021 * Government disclaim all warranties, express or implied, including 00022 * warranties of performance, merchantability or fitness for any particular 00023 * purpose. 00024 * 00025 * Please cite the author in any work or product based on this material. 00026 * 00027 * =========================================================================== 00028 * 00029 * Author: Kevin Bealer 00030 * 00031 */ 00032 00033 /// @file writedb_files.hpp 00034 /// Code for database files construction. 00035 /// 00036 /// Defines classes: 00037 /// CWriteDBHeader 00038 /// 00039 /// Implemented for: UNIX, MS-Windows 00040 00041 #include "writedb_general.hpp" 00042 #include "writedb_convert.hpp" 00043 #include <objects/seq/seq__.hpp> 00044 #include <corelib/ncbistre.hpp> 00045 #include <corelib/ncbifile.hpp> 00046 00047 BEGIN_NCBI_SCOPE 00048 00049 /// Import definitions from the objects namespace. 00050 USING_SCOPE(objects); 00051 00052 /// CWriteDB_IndexFile class 00053 /// 00054 /// This manufactures blast database index files from input data. 00055 00056 class CWriteDB_File : public CObject { 00057 public: 00058 // Setup and control 00059 00060 /// Constructor. 00061 /// 00062 /// The filename is constructed from basename, extension, and 00063 /// index, but might be changed if the RenameSingle() method is 00064 /// called. If zero is specified for maximum file size, a default 00065 /// size is provided by this class. The maximum file size is not 00066 /// enforced by this class, instead each derived class must do its 00067 /// own enforcement. 00068 /// 00069 /// @param basename Database base name, shared by all files. [in] 00070 /// @param extension File name extension for this file. [in] 00071 /// @param index Volume index used in filename. [in] 00072 /// @param max_file_size File size limit (in bytes). [in] 00073 /// @param always_create If true the file will be created now. [in] 00074 CWriteDB_File(const string & basename, 00075 const string & extension, 00076 int index, 00077 Uint8 max_file_size, 00078 bool always_create); 00079 00080 /// Create and open the file. 00081 /// 00082 /// This method must be called before the first time that data is 00083 /// written to the file. If the constructor is passed 'true' for 00084 /// always_create, this method will be called during construction. 00085 /// It is an error to call this method more than once (including 00086 /// via the constructor) or to not call it but to call Write. The 00087 /// rationale for making this explicit is to permit some files to 00088 /// be created optionally, such as ISAM files, which should only 00089 /// be created if the corresponding ID types are found. 00090 void Create(); 00091 00092 /// Write contents of a string to the file. 00093 /// @param data Data to write. 00094 /// @return File offset after write. 00095 int Write(const CTempString & data); 00096 00097 /// Write an Int4 (in bigendian order) to the file. 00098 /// @param data String to write. 00099 /// @return File offset after write. 00100 int WriteInt4(int data) 00101 { 00102 s_WriteInt4(m_RealFile, data); 00103 m_Offset += 4; 00104 return m_Offset; 00105 } 00106 00107 /// Write an Int8 (in bigendian order) to the file. 00108 /// @param data String to write. 00109 /// @return File offset after write. 00110 int WriteInt8(Int8 data) 00111 { 00112 s_WriteInt8BE(m_RealFile, data); 00113 m_Offset += 8; 00114 return m_Offset; 00115 } 00116 00117 /// Write contents of a string to the file, appending a NUL. 00118 /// @param data String to write. 00119 /// @return File offset after write. 00120 int WriteWithNull(const CTempString & data) 00121 { 00122 Write(data); 00123 return Write(m_Nul); 00124 } 00125 00126 /// Close the file, flushing any remaining data to disk. 00127 void Close(); 00128 00129 /// Rename this file, disincluding the volume index. 00130 virtual void RenameSingle(); 00131 00132 /// Construct the short name for a volume. 00133 /// 00134 /// Volume names consist of the database base name, ".", and the 00135 /// volume index in decimal. The volume index is normally two 00136 /// digits, but if more than 100 volumes are needed, the filename 00137 /// will use three or more index digits as needed. 00138 /// 00139 /// @param base Base name to use. 00140 /// @param index Volume index. 00141 /// @return A short name. 00142 static string MakeShortName(const string & base, int index); 00143 00144 /// Get the current filename for this file. 00145 /// 00146 /// The filename is returned. The data returned by this method 00147 /// reflects changes made by RenameSingle(), so it is probably 00148 /// best to call it after that method has been called (if it will 00149 /// be called). 00150 /// 00151 /// @return The filename. 00152 const string & GetFilename() const 00153 { 00154 return m_Fname; 00155 } 00156 00157 protected: 00158 /// True if the file has already been opened. 00159 bool m_Created; 00160 00161 /// Underlying 'output file' type used here. 00162 typedef ofstream TFile; 00163 00164 /// For convenience, a string containing one NUL character. 00165 string m_Nul; // init me 00166 00167 /// The default value for max_file_size. 00168 /// @return The max file size used if otherwise unspecified. 00169 Uint8 x_DefaultByteLimit() 00170 { 00171 // 1 gb (marketing version) - 1; about a billion 00172 return 1000*1000*1000 - 1; 00173 } 00174 00175 /// This should flush any unwritten data to disk. 00176 /// 00177 /// This method must be implemented by derived classes to flush 00178 /// any unwritten data to disk. In the cases of sequence and 00179 /// header files, it will normally do nothing, because such files 00180 /// are written as the data is available. For index (pin/nin) and 00181 /// ISAM files, this method does most of the disk I/O. 00182 virtual void x_Flush() = 0; 00183 00184 /// Build the filename for this file. 00185 void x_MakeFileName(); 00186 00187 // Configuration 00188 00189 string m_BaseName; ///< Database base name for all files. 00190 string m_Extension; ///< File extension for this file. 00191 int m_Index; ///< Volume index. 00192 int m_Offset; ///< Stream position. 00193 Uint8 m_MaxFileSize; ///< Maximum file size in bytes. 00194 00195 // The file 00196 00197 bool m_UseIndex; ///< True if filenames should use volume index. 00198 string m_Fname; ///< Current filename for output file. 00199 TFile m_RealFile; ///< Actual stream implementing the output file. 00200 }; 00201 00202 // For index file format, see .cpp file. 00203 00204 /// This class builds the volume index file (pin or nin). 00205 class CWriteDB_IndexFile : public CWriteDB_File { 00206 public: 00207 /// Constructor. 00208 /// @param dbname Database base name. 00209 /// @param protein True for protein volumes. 00210 /// @param title Database title string. 00211 /// @param date Timestamp of database construction start. 00212 /// @param index Index of this volume. 00213 /// @param max_file_size Maximum file size in bytes (or zero). 00214 CWriteDB_IndexFile(const string & dbname, 00215 bool protein, 00216 const string & title, 00217 const string & date, 00218 int index, 00219 Uint8 max_file_size); 00220 00221 /// Returns true if another sequence can fit into the file. 00222 bool CanFit() 00223 { 00224 _ASSERT(m_MaxFileSize > 1024); 00225 00226 if (! m_OIDs) 00227 return true; 00228 00229 return m_DataSize < (m_MaxFileSize-12); 00230 } 00231 00232 /// Add a sequence to a protein index file (pin). 00233 /// 00234 /// The index file does not need sequence data, so this method 00235 /// only needs offsets of the data in other files. 00236 /// 00237 /// @param Sequence length in letters. 00238 /// @param hdr Length of binary ASN.1 header data. 00239 /// @param seq Length in bytes of sequence data. 00240 void AddSequence(int length, int hdr, int seq) 00241 { 00242 if (length > m_MaxLength) { 00243 m_MaxLength = length; 00244 } 00245 00246 m_OIDs ++; 00247 m_Letters += length; 00248 m_DataSize += 8; 00249 00250 m_Hdr.push_back(hdr); 00251 m_Seq.push_back(seq); 00252 } 00253 00254 /// Add a sequence to a nucleotide index file (nin). 00255 /// 00256 /// The index file does not need sequence data, so this method 00257 /// only needs offsets of the data in other files. 00258 /// 00259 /// @param Sequence length in letters. 00260 /// @param hdr Length of binary ASN.1 header data. 00261 /// @param seq Length in bytes of packed sequence data. 00262 /// @param seq Length in bytes of packed ambiguity data. 00263 void AddSequence(int length, int hdr, int seq, int amb) 00264 { 00265 if (length > m_MaxLength) { 00266 m_MaxLength = length; 00267 } 00268 00269 m_OIDs ++; 00270 m_Letters += length; 00271 00272 m_DataSize += 12; 00273 m_Hdr.push_back(hdr); 00274 m_Seq.push_back(amb); // Not a bug. 00275 m_Amb.push_back(seq); // Also not a bug. 00276 } 00277 00278 private: 00279 /// Compute index file overhead. This is the overhead used by all 00280 /// fields of the index file, and does account for padding. 00281 /// 00282 /// @param T Title string. 00283 /// @param D Create time string. 00284 /// @return Combined size of all meta-data fields in nin/pin file. 00285 int x_Overhead(const string & T, const string & D); 00286 00287 /// Flush index data to disk. 00288 virtual void x_Flush(); 00289 00290 bool m_Protein; ///< True if this is a protein database. 00291 string m_Title; ///< Title string for all database volumes. 00292 string m_Date; ///< Database creation time stamp. 00293 int m_OIDs; ///< OIDs added to database so far. 00294 int m_Overhead; ///< Amount of file used by metadata. 00295 Uint8 m_DataSize; ///< Required space for data once written to disk. 00296 Uint8 m_Letters; ///< Letters of sequence data accumulated so far. 00297 int m_MaxLength; ///< Length of longest sequence. 00298 00299 // Because the lengths are found via "next offset - this offset", 00300 // each array has an extra element. (This is not necesary in the 00301 // case of m_Amb; the last element is never examined because of 00302 // the alternation of sequences and ambiguities.) 00303 00304 /// Start offset in header file of each OID's headers. 00305 /// 00306 /// The end offset is given by the start offset of the following 00307 /// OID's headers. 00308 vector<int> m_Hdr; 00309 00310 /// Offset in sequence file of each OID's sequence data. 00311 /// 00312 /// The end of the sequence data is given by the start offset of 00313 /// the ambiguity data for the same OID. 00314 vector<int> m_Seq; 00315 00316 /// Offset in sequence file of each OID's ambiguity data. 00317 /// 00318 /// The end of the ambiguity data is given by the start offset of 00319 /// the sequence data for the next OID. 00320 vector<int> m_Amb; 00321 }; 00322 00323 /// This class builds the volume header file (phr or nhr). 00324 class CWriteDB_HeaderFile : public CWriteDB_File { 00325 public: 00326 /// Constructor. 00327 /// @param dbname Database base name. 00328 /// @param protein True for protein volumes. 00329 /// @param index Index of this volume. 00330 /// @param max_file_size Maximum file size in bytes (or zero). 00331 CWriteDB_HeaderFile(const string & dbname, 00332 bool protein, 00333 int index, 00334 Uint8 max_file_size); 00335 00336 /// Returns true if the specified amount of data would fit. 00337 /// 00338 /// If the specified amount of data (in bytes) would fit in the 00339 /// file without exceeding the max_file_size, this method returns 00340 /// true. 00341 /// 00342 /// @param size Size of new data in bytes. 00343 bool CanFit(int size) 00344 { 00345 if (! m_DataSize) { 00346 return true; 00347 } 00348 00349 return (m_DataSize + size) < m_MaxFileSize; 00350 } 00351 00352 /// Add binary header data to this file. 00353 /// @param binhdr Binary ASN.1 version of header data. [in] 00354 /// @param offset Offset of end of header data. [out] 00355 void AddSequence(const string & binhdr, int & offset) 00356 { 00357 m_DataSize = offset = Write(binhdr); 00358 } 00359 00360 private: 00361 /// Flush unwritten data to the output file. 00362 virtual void x_Flush() 00363 { 00364 // There is nothing to do here - header data is written as 00365 // soon as it is added. 00366 } 00367 00368 /// Amount of data written so far. 00369 Uint8 m_DataSize; 00370 }; 00371 00372 class CWriteDB_SequenceFile : public CWriteDB_File { 00373 public: 00374 /// Constructor. 00375 /// @param dbname Database base name. 00376 /// @param protein True for protein volumes. 00377 /// @param index Index of this volume. 00378 /// @param max_file_size Maximum file size in bytes (or zero). 00379 /// @param max_letter Maximum sequence letters per volume (or zero). 00380 CWriteDB_SequenceFile(const string & dbname, 00381 bool protein, 00382 int index, 00383 Uint8 max_file_size, 00384 Uint8 max_letters); 00385 00386 /// Returns true if the specified amount of data would fit. 00387 /// 00388 /// If the specified amount of data (in bytes) would fit in the 00389 /// file without exceeding the max_file_size, and the specified 00390 /// number of letters would fit without exceeding the maximum 00391 /// letters limit, this method returns true. 00392 /// 00393 /// @param size Size of new data in bytes. 00394 /// @param letters Number of sequence letters in new data. 00395 bool CanFit(int size, int letters) 00396 { 00397 if (m_Offset <= 1) { 00398 return true; 00399 } 00400 00401 if (m_BaseLimit && 00402 ((m_Letters + letters) > m_BaseLimit)) { 00403 return false; 00404 } 00405 00406 return ((m_Offset + (unsigned)size) < m_MaxFileSize); 00407 } 00408 00409 /// Add a protein sequence to this file. 00410 /// 00411 /// This method should only be called in the protein case. 00412 /// 00413 /// @param sequence Packed sequence data. [in] 00414 /// @param offset Offset of the end of the sequence data. [out] 00415 /// @param length Length of the sequence in letters. [in] 00416 void AddSequence(const string & sequence, 00417 int & offset, 00418 int length) 00419 { 00420 _ASSERT(m_Protein); 00421 offset = WriteWithNull(sequence); 00422 m_Letters += length; 00423 } 00424 00425 /// Add a nucleotide sequence to this file. 00426 /// 00427 /// This method should only be called in the nucleotide case. 00428 /// 00429 /// @param sequence Packed sequence data. [in] 00430 /// @param ambig Packed ambiguity data. [in] 00431 /// @param off_seq Offset of the end of the sequence data. [out] 00432 /// @param off_amb Offset of the end of the ambiguity data. [out] 00433 /// @param length Length of the sequence in letters. [in] 00434 void AddSequence(const string & sequence, 00435 const string & ambig, 00436 int & off_seq, 00437 int & off_amb, 00438 int length) 00439 { 00440 _ASSERT(! m_Protein); 00441 off_seq = Write(sequence); 00442 off_amb = Write(ambig); 00443 m_Letters += length; 00444 } 00445 00446 private: 00447 /// Flush unwritten data to the output file. 00448 virtual void x_Flush() 00449 { 00450 // There is nothing to do here - sequence data is written as 00451 // soon as it is added. 00452 } 00453 00454 Uint8 m_Letters; ///< Letters of sequence data added so far. 00455 Uint8 m_BaseLimit; ///< Limit on letters of sequence data. 00456 bool m_Protein; ///< True if this is a protein database. 00457 }; 00458 00459 END_NCBI_SCOPE 00460 00461 00462 #endif // OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP 00463 00464 00465