00001 #ifndef GUI_OBJUTILS___SNP_BITFIELD__HPP 00002 #define GUI_OBJUTILS___SNP_BITFIELD__HPP 00003 00004 /* $Id: snp_bitfield.hpp 17219 2008-06-27 16:51:37Z dicuccio $ 00005 * =========================================================================== 00006 * 00007 * PUBLIC DOMAIN NOTICE 00008 * National Center for Biotechnology Information 00009 * 00010 * This software/database is a "United States Government Work" under the 00011 * terms of the United States Copyright Act. It was written as part of 00012 * the author's official duties as a United States Government employee and 00013 * thus cannot be copyrighted. This software/database is freely available 00014 * to the public for use. The National Library of Medicine and the U.S. 00015 * Government have not placed any restriction on its use or reproduction. 00016 * 00017 * Although all reasonable efforts have been taken to ensure the accuracy 00018 * and reliability of the software and data, the NLM and the U.S. 00019 * Government do not and cannot warrant the performance or results that 00020 * may be obtained by using this software or data. The NLM and the U.S. 00021 * Government disclaim all warranties, express or implied, including 00022 * warranties of performance, merchantability or fitness for any particular 00023 * purpose. 00024 * 00025 * Please cite the author in any work or product based on this material. 00026 * 00027 * =========================================================================== 00028 * 00029 * Authors: Melvin Quintos 00030 * 00031 * File Description: 00032 * 00033 */ 00034 00035 #include <corelib/ncbistd.hpp> 00036 #include <gui/gui_export.h> 00037 00038 #include <memory> 00039 00040 BEGIN_NCBI_SCOPE 00041 00042 class CSnpBitfieldFactory; 00043 00044 /** 00045 * CSnpBitfield is a facade for representing any version of the SNP 00046 * bitfield. A CSnpBitfield is created from a vector<char> data type. 00047 * 00048 * Example: 00049 * vector<char> data = <get data e.g. CUser_field::C_Data::GetOs > 00050 * CSnpBitfield bitfield = data 00051 * 00052 * Internally, the CSnpBitfield uses a Factory (CSnpBitfieldFactory) 00053 * to determine the version/format of the bitfield to create and store. 00054 * Although it is possible to create bitfields from the Factory, it is 00055 * best to use this class, CSnpBitfield, instead. 00056 * 00057 * CSnpBitfield is a facade to the CSnpBitfield::IEncoding interface. 00058 * The CSnpBitfield::IEncoding and CSnpBitfield::EProperty will evolve to 00059 * represent the latest SNP bitfield fields. As newer bitfield versions 00060 * are introduced, all subclasses of CSnpBitfield::IEncoding are recompiled 00061 * to ensure the latest features of the bitfield are backwards compatible. 00062 * Developers that also modify CSnpBitfield and related classes should run the 00063 * unit_test_snp project to test and make sure nothing was broken. 00064 * 00065 * For example: 00066 * CSnpBitfield2 (v2) introduced a byte for version number (Not found in v1.2). 00067 * CSnpBitfield::IEncoding was modified to get version number (e.g. GetVersion). 00068 * CSnpBitfield1_2 (v1.2) was forced to be recompiled. 00069 * Calls to 1.2's implementation of 'GetVersion' return 1 00070 * 00071 **/ 00072 class CSnpBitfield 00073 { 00074 00075 /////////////////////////////////////////////////////////////////////////////// 00076 // Public Structs/Inner-classes/ Enumerations 00077 /////////////////////////////////////////////////////////////////////////////// 00078 public: 00079 00080 enum EProperty 00081 { 00082 // Note: The order of the properties is important. Explicitly 00083 // assigned values are intended. 00084 00085 // DO NOT MODIFY EXISTING ASSIGNED VALUES. 00086 // ADD NEW PROPERTIES TO END OF ENUMERATION 00087 00088 // F1 Link 00089 eHasLinkOut = 0, ///< Has SubmitterLinkOut From SNP->SubSNP->Batch.link_out 00090 eHasSnp3D = 1, ///< Has 3D structure SNP3D 00091 eHasSTS = 2, ///< Has STS Query Entrez to get the current links 00092 eHasEntrez = 3, ///< Has EntrezGene Query Entrez to get the current links 00093 eHasProbeDB = 4, ///< Has ProbeDB Query Entrez to get the current links 00094 eHasGEO = 5, ///< Has GEO Query Entrez to get the current links 00095 eHasAssembly = 6, ///< Has Assembly Query Entrez to get the current links 00096 eHasTrace = 7, ///< Has Trace Query Entrez to get the current links 00097 eFromMgcClone = 8, ///< From MGC clone We have ~20K rs. This bit could be set from specific submitter handle/ batch_id 00098 eHasOrganism = 9, ///< Has OrganismDBLink (Ex. Jackson Lab for mouse) 00099 00100 // F2 Gene Function is handled separately See EFunctionClass 00101 00102 // F3 Map 00103 eIsAssemblySpecific = 10, // Is Assembly specific. This bit is 1 if the snp only maps to one assembly 00104 eHasAssemblyConflict= 11, // Has Assembly conflict. This is for weight 1 and 2 snp that maps to different chromosomes on different assemblies 00105 eHasOtherSameSNP = 12, // Has other snp with exactly the same set of mapping position on NCBI refernce assembly 00106 00107 // F4 Freq 00108 e5PctMinorAllele1Plus = 13, // >5% minor allele frequency in 1+ populations 00109 e5PctMinorAlleleAll = 14, // >5% minor allele frequency in each and all populations. 00110 eIsDoubleHit = 15, // Deprecated in v4+. This bit is set if the rs# is in Jim Mullikin's double hit submission which has been only on human snp. 00111 eIsMutation = 16, // Is mutation (journal citation, explicit fact) low frequency variation that is cited in journal and other reputable sources. 00112 00113 // F5 GTY 00114 eHasGenotype = 17, // Genotypes available. The snp has individual genotype (in SubInd table). 00115 eInHaplotypeSet = 18, // In Haplotype tagging set 00116 eInGenotypeKit = 19, // Marker is on high density genotyping kit (50K density or greater). The snp may have phenotype associations present in dbGaP 00117 00118 // F6 Hapmap 00119 ePhase1Attempted = 20, // Phase 1 attempted all snp in HapMap unfiltered-redundant set 00120 ePhase1Genotyped = 21, // Phase 1 genotyped a subset of above: filtered, non-redundant 00121 ePhase2Attempted = 22, // Phase 2 attempted 00122 ePhase2Genotyped = 23, // Phase 2 genotyped filtered, non-redundant 00123 ePhase3Attempted = 24, // Phase 3 attempted 00124 ePhase3Genotyped = 25, // Phase 3 genotyped filtered, non-redundant 00125 00126 // F7 Phenotype 00127 eHasOMIM_OMIA = 26, // Has OMIM/OMIA 00128 eHasSnpRIF = 27, // Has SnpRIF 00129 eHasLodScore = 28, // Has LOD score 00130 eHasPhenoDB = 29, // Has significant association in dbGaP study 00131 eHasDiseaseInfo = 30, // Submitted as a disease-related mutation and/or present in a locus-specific database 00132 eHasTranscriptionFactor = 31, // Has transcription factor 00133 eHasClinicalAssay = 32, // Variation is interrogated in a clinical diagnostic assay Note: Used to be eHasMPO(Mammalian Pheonotype Ontology), but never used 00134 eHasMeSH = 33, // Has MeSH is linked to a disease 00135 00136 // F8 Variation class is handled separately See EVariationClass 00137 00138 // F9 Quality Check 00139 eHasGenotypeConflict = 34, // Has Genotype Conflict Same (rs, ind), different genotype. N/N is not included 00140 eIsStrainSpecific = 35, // Is Strain Specific 00141 eHasMendelError = 36, // Has Mendelian Error 00142 eHasHardyWeinbergDeviation = 37, // Has Hardy Weinberg deviation 00143 eHasMemberSsConflict = 38, // Has member ss with conflict alleles 00144 eIsWithdrawn = 39, // Is Withdrawn by submitter If one member ss is withdrawn by submitter, then this bit is set. If all member ss' are withdrawn, then the rs is deleted to SNPHistory 00145 00146 // Version 2 additions 00147 // F1 Link 00148 eHasShortReadArchive = 40, // Has Short Read Archive link 00149 00150 // Version 3 additions 00151 // F9 Quality 00152 eIsContigAlleleAbsent = 41, // Contig allele not present in SNP allele list. The reference sequence allele at the mapped position is not present in the SNP allele list, adjusted for orientation 00153 00154 // Version 2 & 3 (hidden in F2, gene function properties. will be moved out of F2 in later bitfield versions) 00155 eHasReference = 42, // A coding region variation where one allele in the set is identical to the reference sequence. FxnCode = 8 00156 00157 // Version 4 Additions 00158 eIsValidated = 43 // This bit is set if the snp has 2+ minor allele count based on frequency of genotype data 00159 /// Add additional properties here. 00160 00161 }; 00162 00163 // A SNP can only be one class of variation 00164 enum EVariationClass 00165 { 00166 eUnknownVariation = 0, 00167 eSingleBase = 1, 00168 eDips = 2, 00169 eHeterozygous = 3, 00170 eMicrosatellite = 4, 00171 eNamedSNP = 5, 00172 eNoVariation = 6, 00173 eMixed = 7, 00174 eMultiBase = 8 00175 }; 00176 00177 // Function class (gene_prop in v1.2) 00178 // A SNP can belong to more than one gene function class 00179 enum EFunctionClass 00180 { 00181 eUnknownFxn = 0, // Uknown 00182 eIntron = 1, // In Intron 00183 eDonor = 2, // In donor splice-site 00184 eAcceptor = 3, // In acceptor splice site 00185 eUTR = 4, // In Exon. location is in a spliced transcript. Is "untranslated region" (UTR) if "In CDS" is false 00186 eSynonymous = 5, // In coding region (CDS). A subset of "Exon" excluding "UTR": SYNONYMOUS if bits 5-7 are false 00187 eNonsense = 6, // Is non-synonymous Nonsense. Changes to STOP codon (TER) 00188 eMissense = 7, // Is non-synonymous Missense. Changes protein peptide 00189 eFrameshift = 8, // Is non-synonymous Frameshift. Changes all downstream amino acids 00190 00191 // Version 2 additions 00192 eInGene = 9, // In gene segment Defined as sequence intervals covered by a gene ID but not having an aligned transcript. FxnCode = 11 00193 eInGene5 = 10, // In 5' gene region FxnCode = 15 00194 eInGene3 = 11, // In 3' gene region FxnCode = 13 00195 eInUTR5 = 12, // In 5' UTR Location is in an untranslated region (UTR). FxnCode = 55 00196 eInUTR3 = 13, // In 3' UTR Location is in an untranslated region (UTR). FxnCode = 53 00197 eMultipleFxn = 14 // Has multiple functions (i.e. fwd strand 5'near gene, rev strand 3'near gene) 00198 // use IsTrue(EFunctionClass) to determine function classes the snp belongs to. 00199 }; 00200 00201 /////////////////////////////////////////////////////////////////////////////// 00202 // Public Methods 00203 /////////////////////////////////////////////////////////////////////////////// 00204 public: 00205 00206 static const char * GetString(EVariationClass e); 00207 static const char * GetString(EFunctionClass e); 00208 static bool IsCompatible(EFunctionClass e1, EFunctionClass e2); 00209 00210 CSnpBitfield(); 00211 CSnpBitfield(const CSnpBitfield &rhs); 00212 CSnpBitfield(const std::vector<char> &rhs); 00213 00214 CSnpBitfield & operator=( const CSnpBitfield &rhs ); 00215 CSnpBitfield & operator=( const std::vector<char> &rhs); 00216 00217 bool IsTrue(EProperty prop) const; 00218 bool IsTrue(EFunctionClass fxn) const; 00219 bool IsTrue(EVariationClass var) const; 00220 int GetWeight() const; 00221 int GetVersion() const; 00222 EVariationClass GetVariationClass() const; 00223 EFunctionClass GetFunctionClass() const; 00224 const char * GetGenePropertyString() const; 00225 const char * GetVariationClassString() const; 00226 const char * GetString() const; 00227 00228 private: 00229 void x_CreateString(); 00230 00231 /////////////////////////////////////////////////////////////////////////////// 00232 // Public Inner Classes 00233 /////////////////////////////////////////////////////////////////////////////// 00234 public: 00235 00236 class IEncoding 00237 { 00238 public: 00239 virtual bool IsTrue(EProperty e) const = 0; 00240 virtual bool IsTrue(EFunctionClass e) const = 0; 00241 virtual int GetWeight() const = 0; 00242 virtual int GetVersion() const = 0; 00243 virtual CSnpBitfield::EFunctionClass GetFunctionClass() const = 0; 00244 virtual CSnpBitfield::EVariationClass GetVariationClass() const = 0; 00245 virtual const char * GetString() const = 0; 00246 virtual IEncoding * Clone() = 0; 00247 virtual ~IEncoding(){}; 00248 }; 00249 00250 /////////////////////////////////////////////////////////////////////////////// 00251 // Private Data 00252 /////////////////////////////////////////////////////////////////////////////// 00253 private: 00254 00255 std::auto_ptr<IEncoding> m_bitfield; // inits to null object 00256 static CSnpBitfieldFactory sm_Factory; // one shared factory 00257 }; 00258 00259 /////////////////////////////////////////////////////////////////////////////// 00260 // Inline methods 00261 /////////////////////////////////////////////////////////////////////////////// 00262 inline bool CSnpBitfield::IsTrue(CSnpBitfield::EProperty prop) const { 00263 return m_bitfield->IsTrue(prop); 00264 } 00265 00266 inline bool CSnpBitfield::IsTrue(CSnpBitfield::EFunctionClass fxn) const { 00267 return m_bitfield->IsTrue(fxn); 00268 } 00269 00270 inline bool CSnpBitfield::IsTrue(CSnpBitfield::EVariationClass var) const { 00271 return (m_bitfield->GetVariationClass() == var); 00272 } 00273 00274 inline int CSnpBitfield::GetWeight() const { 00275 return m_bitfield->GetWeight(); 00276 } 00277 00278 inline CSnpBitfield::EFunctionClass CSnpBitfield::GetFunctionClass() const { 00279 return m_bitfield->GetFunctionClass(); 00280 } 00281 00282 inline CSnpBitfield::EVariationClass CSnpBitfield::GetVariationClass() const { 00283 return m_bitfield->GetVariationClass(); 00284 } 00285 00286 inline const char * CSnpBitfield::GetString() const { 00287 return m_bitfield->GetString(); 00288 } 00289 00290 inline int CSnpBitfield::GetVersion() const { 00291 return m_bitfield->GetVersion(); 00292 } 00293 00294 END_NCBI_SCOPE 00295 00296 #endif // GUI_OBJUTILS___SNP_BITFIELD__HPP 00297 00298 00299