Changeset 16232

Show
Ignore:
Timestamp:
15/07/08 16:20:23 (1 month ago)
Author:
julie
Message:

create synonyms for protein domains. Refs #1709

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/bio/sources/interpro/.classpath

    r9860 r16232  
    1111    <classpathentry combineaccessrules="false" kind="src" path="/intermine-integrate-test"/> 
    1212    <classpathentry combineaccessrules="false" kind="src" path="/bio-postprocess-main"/> 
     13    <classpathentry kind="lib" path="test/resources"/> 
    1314    <classpathentry kind="output" path="bin"/> 
    1415</classpath> 
  • trunk/bio/sources/interpro/main/src/org/intermine/bio/dataconversion/InterProConverter.java

    r15808 r16232  
    1313import java.io.Reader; 
    1414import java.util.ArrayList; 
     15import java.util.Collections; 
    1516import java.util.HashMap; 
    1617import java.util.Map; 
     
    3839public class InterProConverter extends FileConverter 
    3940{ 
    40     private Map<String, Object> mapMaster = new HashMap<String, Object>(); 
    4141    protected static final String GENOMIC_NS = "http://www.flymine.org/model/genomic#"; 
    4242    private Map<String, Item> pubMaster = new HashMap<String, Item>(); 
     
    5858     */ 
    5959    public void process(Reader reader) throws Exception { 
    60         mapMaps(); 
    61         InterProHandler handler = new InterProHandler(getItemWriter(), mapMaster); 
    62  
     60        InterProHandler handler = new InterProHandler(getItemWriter()); 
    6361        try { 
    6462            SAXParser.parse(new InputSource(reader), handler); 
     
    6866        } 
    6967 
    70     } 
    71  
    72     private void mapMaps() { 
    73         mapMaster.put("pubMaster", pubMaster); 
    74         mapMaster.put("dbMaster", dbMaster); 
    75         mapMaster.put("dsMaster", dsMaster); 
    76         mapMaster.put("proteinDomains", proteinDomains); 
    7768    } 
    7869 
     
    9081 
    9182        private Item domainRelationships; 
    92         private Map<String, Item> pubMaster; 
    93         private Map<String, Item> dbMaster; 
    94         private Map<String, Item> dsMaster; 
    95         private Map<String, Item> proteinDomains; 
    9683        private Item datasource; 
    9784        private Item dataset; 
     
    10996         * @param mapMaster the Map of maps 
    11097         */ 
    111         public InterProHandler(ItemWriter writer, Map mapMaster) { 
    112  
     98        public InterProHandler(ItemWriter writer) { 
    11399            this.writer = writer; 
    114             this.proteinDomains = (Map) mapMaster.get("proteinDomains"); 
    115             this.pubMaster = (Map) mapMaster.get("pubMaster"); 
    116             this.dbMaster = (Map) mapMaster.get("dbMaster"); 
    117             this.dsMaster = (Map) mapMaster.get("dsMaster"); 
    118100        } 
    119101 
     
    219201         */ 
    220202        public void characters(char[] ch, int start, int length) { 
     203            int st = start; 
     204            int l = length; 
    221205            if (attName != null) { 
    222206 
    223207                // DefaultHandler may call this method more than once for a single 
    224208                // attribute content -> hold text & create attribute in endElement 
    225                 while (length > 0) { 
     209                while (l > 0) { 
    226210                    boolean whitespace = false; 
    227                     switch(ch[start]) { 
     211                    switch(ch[st]) { 
    228212                    case ' ': 
    229213                    case '\r': 
     
    238222                        break; 
    239223                    } 
    240                     ++start; 
    241                     --length; 
    242                 } 
    243  
    244                 if (length > 0) { 
     224                    ++st; 
     225                    --l; 
     226                } 
     227 
     228                if (l > 0) { 
    245229                    StringBuffer s = new StringBuffer(); 
    246                     s.append(ch, start, length); 
     230                    s.append(ch, st, l); 
    247231                    attValue.append(s); 
    248232                    if (attName.equals("description")) { 
     
    268252 
    269253                    for (Item item : proteinDomains.values()) { 
    270                         writer.store(ItemHelper.convert(item)); 
     254                        createSynonym(item.getIdentifier(), "identifier", 
     255                                      item.getAttribute("primaryIdentifier").getValue(), 
     256                                      datasource.getIdentifier()); 
     257                        store(item); 
    271258                    } 
    272259 
    273260                    for (Item item : delayedItems) { 
    274                         writer.store(ItemHelper.convert(item)); 
     261                        store(item); 
    275262                    } 
    276263 
     
    312299            if (synonyms.get(key) == null) { 
    313300                Item syn = createItem("Synonym"); 
    314                 syn.addReference(new Reference("subject", subjectId)); 
     301                syn.setReference("subject", subjectId); 
    315302                syn.setAttribute("type", type); 
    316303                syn.setAttribute("value", value); 
    317                 syn.addReference(new Reference("source", dbId)); 
     304                syn.setReference("source", dbId); 
    318305                synonyms.put(key, syn); 
    319306                delayedItems.add(syn); 
    320307            } 
    321  
    322308        } 
    323309 
     
    339325            synonyms = new HashMap(); 
    340326 
    341             ReferenceList evidenceColl = new ReferenceList("dataSets", new ArrayList()); 
    342             proteinDomain.addCollection(evidenceColl); 
    343             evidenceColl.addRefId(dataset.getIdentifier()); 
     327            proteinDomain.setCollection("dataSets", 
     328                              new ArrayList(Collections.singleton(dataset.getIdentifier()))); 
    344329 
    345330            description = new StringBuffer(); 
    346  
    347331        } 
    348332 
  • trunk/bio/sources/interpro/test/resources/InterproConverterTest_tgt.xml

    r15816 r16232  
    11<items> 
     2<item id="2_2" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
     3<attribute name="primaryIdentifier" value="IPR100000"/> 
     4</item> 
     5<item id="2_7" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
     6<attribute name="primaryIdentifier" value="IPR400000"/> 
     7</item> 
    28<item id="0_4" class="http://www.flymine.org/model/genomic#DataSource"> 
    39<attribute name="name" value="Datasource 3"/> 
    4 </item> 
    5 <item id="2_3" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
    6 <attribute name="primaryIdentifier" value="IPR200001"/> 
    7 </item> 
    8 <item id="4_2" class="http://www.flymine.org/model/genomic#Publication"> 
    9 <attribute name="pubMedId" value="2222222"/> 
    1010</item> 
    1111<item id="5_5" class="http://www.flymine.org/model/genomic#Synonym"> 
     
    1515<reference name="subject" ref_id="2_1"/> 
    1616</item> 
     17<item id="4_2" class="http://www.flymine.org/model/genomic#Publication"> 
     18<attribute name="pubMedId" value="2222222"/> 
     19</item> 
     20<item id="2_4" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
     21<attribute name="primaryIdentifier" value="IPR200002"/> 
     22</item> 
    1723<item id="4_5" class="http://www.flymine.org/model/genomic#Publication"> 
    1824<attribute name="pubMedId" value="9999999"/> 
    1925</item> 
    20 <item id="2_2" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
    21 <attribute name="primaryIdentifier" value="IPR100000"/> 
     26<item id="3_2" class="http://www.flymine.org/model/genomic#DomainRelationship"> 
    2227</item> 
    23 <item id="2_5" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
    24 <attribute name="primaryIdentifier" value="IPR200003"/> 
    25 </item> 
    26 <item id="3_2" class="http://www.flymine.org/model/genomic#DomainRelationship"> 
     28<item id="2_8" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
     29<attribute name="description" value="&lt;p&gt;PAS domains are involved in many signalling proteins where theyare used as a signal sensor domain. PAS domains appear in archaea,bacteria and eukaryotes. Several PAS-domain proteins are known todetect their signal by way of an associated cofactor. Haeme,flavin, and a 4-hydroxycinnamyl chromophore are used in differentproteins. The PAS domain was named after three proteins that itoccurs in: &lt;/p&gt;&lt;li&gt;Per- period circadian protein&lt;/li&gt;&lt;li&gt;Arnt- Ah receptor nuclear translocator protein&lt;/li&gt;&lt;li&gt;Sim-  single-minded protein.&lt;/li&gt;&lt;p&gt;PAS domains are often associated withPAC domains .  It appears that these domains are directly linked, and that together they form the conserved 3D PAS fold.  The division between the PAS and PAC domains is caused by major differences in sequences in the region connecting these two motifs .  In human PAS kinase, this region has been shown to be very flexible, and adopts different conformations depending on the bound ligand .Probably the most surprising identification of a PAS domain was that inEAG-like K&lt;sup&gt;+&lt;/sup&gt;-channels .&lt;/p&gt;"/> 
     30<attribute name="name" value="monkey pants"/> 
     31<attribute name="primaryIdentifier" value="IPR00002"/> 
     32<attribute name="shortName" value="monkey"/> 
     33<attribute name="type" value="Domain"/> 
     34<reference name="domainRelationships" ref_id="3_2"/> 
     35<collection name="dataSets"><reference ref_id="1_1"/></collection> 
     36<collection name="publications"><reference ref_id="4_1"/><reference ref_id="4_2"/><reference ref_id="4_5"/></collection> 
    2737</item> 
    2838<item id="5_2" class="http://www.flymine.org/model/genomic#Synonym"> 
     
    4757<reference name="subject" ref_id="2_1"/> 
    4858</item> 
     59<item id="5_13" class="http://www.flymine.org/model/genomic#Synonym"> 
     60<attribute name="type" value="identifier"/> 
     61<attribute name="value" value="IPR200003"/> 
     62<reference name="source" ref_id="0_1"/> 
     63<reference name="subject" ref_id="2_5"/> 
     64</item> 
    4965<item id="0_2" class="http://www.flymine.org/model/genomic#DataSource"> 
    5066<attribute name="name" value="Datasource 1"/> 
    51 </item> 
    52 <item id="2_6" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
    53 <attribute name="primaryIdentifier" value="IPR300000"/> 
    5467</item> 
    5568<item id="5_1" class="http://www.flymine.org/model/genomic#Synonym"> 
     
    6578<attribute name="pubMedId" value="1111111"/> 
    6679</item> 
     80<item id="2_1" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
     81<attribute name="description" value="Kringles are autonomous structural domains, found throughout the blood clotting and fibrinolytic proteins.Kringle domains are believed to play a role in binding mediators (e.g., membranes,other proteins or phospholipids), and in the regulation of proteolytic activity, , . Kringle domains , , are characterised by a triple loop, 3-disulphide bridge structure, whose  conformation is defined by a number of hydrogen bonds and small pieces of  anti-parallel beta-sheet. They are found in a varying number  of  copies  in some plasma proteins including prothrombin and urokinase-type plasminogen activator, which are serine proteases belonging to MEROPS peptidase family S1A."/> 
     82<attribute name="name" value="supertubby"/> 
     83<attribute name="primaryIdentifier" value="IPR000001"/> 
     84<attribute name="shortName" value="tubby"/> 
     85<attribute name="type" value="Domain"/> 
     86<reference name="domainRelationships" ref_id="3_1"/> 
     87<collection name="dataSets"><reference ref_id="1_1"/></collection> 
     88<collection name="publications"><reference ref_id="4_1"/><reference ref_id="4_2"/><reference ref_id="4_3"/><reference ref_id="4_4"/></collection> 
     89</item> 
    6790<item id="3_1" class="http://www.flymine.org/model/genomic#DomainRelationship"> 
    6891<collection name="childFeatures"><reference ref_id="2_3"/><reference ref_id="2_4"/><reference ref_id="2_5"/></collection> 
     
    7497<attribute name="name" value="InterPro"/> 
    7598</item> 
    76 <item id="2_1" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
    77 <attribute name="description" value="Kringles are autonomous structural domains, found throughout the blood clotting and fibrinolytic proteins.Kringle domains are believed to play a role in binding mediators (e.g., membranes,other proteins or phospholipids), and in the regulation of proteolytic activity, , . Kringle domains , , are characterised by a triple loop, 3-disulphide bridge structure, whose  conformation is defined by a number of hydrogen bonds and small pieces of  anti-parallel beta-sheet. They are found in a varying number  of  copies  in some plasma proteins including prothrombin and urokinase-type plasminogen activator, which are serine proteases belonging to MEROPS peptidase family S1A."/> 
    78 <attribute name="primaryIdentifier" value="IPR000001"/> 
    79 <attribute name="name" value="supertubby"/> 
    80 <attribute name="shortName" value="tubby"/> 
    81 <attribute name="type" value="Domain"/> 
    82 <reference name="domainRelationships" ref_id="3_1"/> 
    83 <collection name="dataSets"><reference ref_id="1_1"/></collection> 
    84 <collection name="publications"><reference ref_id="4_1"/><reference ref_id="4_2"/><reference ref_id="4_3"/><reference ref_id="4_4"/></collection> 
     99<item id="2_3" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
     100<attribute name="primaryIdentifier" value="IPR200001"/> 
    85101</item> 
    86102<item id="5_6" class="http://www.flymine.org/model/genomic#Synonym"> 
     
    88104<attribute name="value" value="Synonym 4"/> 
    89105<reference name="source" ref_id="0_3"/> 
     106<reference name="subject" ref_id="2_1"/> 
     107</item> 
     108<item id="5_15" class="http://www.flymine.org/model/genomic#Synonym"> 
     109<attribute name="type" value="identifier"/> 
     110<attribute name="value" value="IPR00002"/> 
     111<reference name="source" ref_id="0_1"/> 
     112<reference name="subject" ref_id="2_8"/> 
     113</item> 
     114<item id="5_14" class="http://www.flymine.org/model/genomic#Synonym"> 
     115<attribute name="type" value="identifier"/> 
     116<attribute name="value" value="IPR000001"/> 
     117<reference name="source" ref_id="0_1"/> 
    90118<reference name="subject" ref_id="2_1"/> 
    91119</item> 
     
    96124<reference name="subject" ref_id="2_1"/> 
    97125</item> 
     126<item id="2_6" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
     127<attribute name="primaryIdentifier" value="IPR300000"/> 
     128</item> 
    98129<item id="1_1" class="http://www.flymine.org/model/genomic#DataSet"> 
    99130<attribute name="title" value="InterPro data set"/> 
     
    103134<attribute name="pubMedId" value="4444444"/> 
    104135</item> 
     136<item id="5_11" class="http://www.flymine.org/model/genomic#Synonym"> 
     137<attribute name="type" value="identifier"/> 
     138<attribute name="value" value="IPR100000"/> 
     139<reference name="source" ref_id="0_1"/> 
     140<reference name="subject" ref_id="2_2"/> 
     141</item> 
     142<item id="5_10" class="http://www.flymine.org/model/genomic#Synonym"> 
     143<attribute name="type" value="identifier"/> 
     144<attribute name="value" value="IPR300000"/> 
     145<reference name="source" ref_id="0_1"/> 
     146<reference name="subject" ref_id="2_6"/> 
     147</item> 
    105148<item id="5_3" class="http://www.flymine.org/model/genomic#Synonym"> 
    106149<attribute name="type" value="name"/> 
     
    109152<reference name="subject" ref_id="2_1"/> 
    110153</item> 
    111 <item id="2_8" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
    112 <attribute name="description" value="&lt;p&gt;PAS domains are involved in many signalling proteins where theyare used as a signal sensor domain. PAS domains appear in archaea,bacteria and eukaryotes. Several PAS-domain proteins are known todetect their signal by way of an associated cofactor. Haeme,flavin, and a 4-hydroxycinnamyl chromophore are used in differentproteins. The PAS domain was named after three proteins that itoccurs in: &lt;/p&gt;&lt;li&gt;Per- period circadian protein&lt;/li&gt;&lt;li&gt;Arnt- Ah receptor nuclear translocator protein&lt;/li&gt;&lt;li&gt;Sim-  single-minded protein.&lt;/li&gt;&lt;p&gt;PAS domains are often associated withPAC domains .  It appears that these domains are directly linked, and that together they form the conserved 3D PAS fold.  The division between the PAS and PAC domains is caused by major differences in sequences in the region connecting these two motifs .  In human PAS kinase, this region has been shown to be very flexible, and adopts different conformations depending on the bound ligand .Probably the most surprising identification of a PAS domain was that inEAG-like K&lt;sup&gt;+&lt;/sup&gt;-channels .&lt;/p&gt;"/> 
    113 <attribute name="primaryIdentifier" value="IPR00002"/> 
    114 <attribute name="name" value="monkey pants"/> 
    115 <attribute name="shortName" value="monkey"/> 
    116 <attribute name="type" value="Domain"/> 
    117 <reference name="domainRelationships" ref_id="3_2"/> 
    118 <collection name="dataSets"><reference ref_id="1_1"/></collection> 
    119 <collection name="publications"><reference ref_id="4_1"/><reference ref_id="4_2"/><reference ref_id="4_5"/></collection> 
     154<item id="5_16" class="http://www.flymine.org/model/genomic#Synonym"> 
     155<attribute name="type" value="identifier"/> 
     156<attribute name="value" value="IPR200002"/> 
     157<reference name="source" ref_id="0_1"/> 
     158<reference name="subject" ref_id="2_4"/> 
    120159</item> 
    121 <item id="2_7" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
    122 <attribute name="primaryIdentifier" value="IPR400000"/> 
     160<item id="5_9" class="http://www.flymine.org/model/genomic#Synonym"> 
     161<attribute name="type" value="identifier"/> 
     162<attribute name="value" value="IPR200001"/> 
     163<reference name="source" ref_id="0_1"/> 
     164<reference name="subject" ref_id="2_3"/> 
    123165</item> 
    124 <item id="2_4" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
    125 <attribute name="primaryIdentifier" value="IPR200002"/> 
     166<item id="5_12" class="http://www.flymine.org/model/genomic#Synonym"> 
     167<attribute name="type" value="identifier"/> 
     168<attribute name="value" value="IPR400000"/> 
     169<reference name="source" ref_id="0_1"/> 
     170<reference name="subject" ref_id="2_7"/> 
     171</item> 
     172<item id="2_5" class="http://www.flymine.org/model/genomic#ProteinDomain"> 
     173<attribute name="primaryIdentifier" value="IPR200003"/> 
    126174</item> 
    127175</items>