Changeset 16232
- Timestamp:
- 15/07/08 16:20:23 (1 month ago)
- Files:
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/bio/sources/interpro/.classpath
r9860 r16232 11 11 <classpathentry combineaccessrules="false" kind="src" path="/intermine-integrate-test"/> 12 12 <classpathentry combineaccessrules="false" kind="src" path="/bio-postprocess-main"/> 13 <classpathentry kind="lib" path="test/resources"/> 13 14 <classpathentry kind="output" path="bin"/> 14 15 </classpath> trunk/bio/sources/interpro/main/src/org/intermine/bio/dataconversion/InterProConverter.java
r15808 r16232 13 13 import java.io.Reader; 14 14 import java.util.ArrayList; 15 import java.util.Collections; 15 16 import java.util.HashMap; 16 17 import java.util.Map; … … 38 39 public class InterProConverter extends FileConverter 39 40 { 40 private Map<String, Object> mapMaster = new HashMap<String, Object>();41 41 protected static final String GENOMIC_NS = "http://www.flymine.org/model/genomic#"; 42 42 private Map<String, Item> pubMaster = new HashMap<String, Item>(); … … 58 58 */ 59 59 public void process(Reader reader) throws Exception { 60 mapMaps(); 61 InterProHandler handler = new InterProHandler(getItemWriter(), mapMaster); 62 60 InterProHandler handler = new InterProHandler(getItemWriter()); 63 61 try { 64 62 SAXParser.parse(new InputSource(reader), handler); … … 68 66 } 69 67 70 }71 72 private void mapMaps() {73 mapMaster.put("pubMaster", pubMaster);74 mapMaster.put("dbMaster", dbMaster);75 mapMaster.put("dsMaster", dsMaster);76 mapMaster.put("proteinDomains", proteinDomains);77 68 } 78 69 … … 90 81 91 82 private Item domainRelationships; 92 private Map<String, Item> pubMaster;93 private Map<String, Item> dbMaster;94 private Map<String, Item> dsMaster;95 private Map<String, Item> proteinDomains;96 83 private Item datasource; 97 84 private Item dataset; … … 109 96 * @param mapMaster the Map of maps 110 97 */ 111 public InterProHandler(ItemWriter writer, Map mapMaster) { 112 98 public InterProHandler(ItemWriter writer) { 113 99 this.writer = writer; 114 this.proteinDomains = (Map) mapMaster.get("proteinDomains");115 this.pubMaster = (Map) mapMaster.get("pubMaster");116 this.dbMaster = (Map) mapMaster.get("dbMaster");117 this.dsMaster = (Map) mapMaster.get("dsMaster");118 100 } 119 101 … … 219 201 */ 220 202 public void characters(char[] ch, int start, int length) { 203 int st = start; 204 int l = length; 221 205 if (attName != null) { 222 206 223 207 // DefaultHandler may call this method more than once for a single 224 208 // attribute content -> hold text & create attribute in endElement 225 while (l ength> 0) {209 while (l > 0) { 226 210 boolean whitespace = false; 227 switch(ch[st art]) {211 switch(ch[st]) { 228 212 case ' ': 229 213 case '\r': … … 238 222 break; 239 223 } 240 ++st art;241 --l ength;242 } 243 244 if (l ength> 0) {224 ++st; 225 --l; 226 } 227 228 if (l > 0) { 245 229 StringBuffer s = new StringBuffer(); 246 s.append(ch, st art, length);230 s.append(ch, st, l); 247 231 attValue.append(s); 248 232 if (attName.equals("description")) { … … 268 252 269 253 for (Item item : proteinDomains.values()) { 270 writer.store(ItemHelper.convert(item)); 254 createSynonym(item.getIdentifier(), "identifier", 255 item.getAttribute("primaryIdentifier").getValue(), 256 datasource.getIdentifier()); 257 store(item); 271 258 } 272 259 273 260 for (Item item : delayedItems) { 274 writer.store(ItemHelper.convert(item));261 store(item); 275 262 } 276 263 … … 312 299 if (synonyms.get(key) == null) { 313 300 Item syn = createItem("Synonym"); 314 syn. addReference(new Reference("subject", subjectId));301 syn.setReference("subject", subjectId); 315 302 syn.setAttribute("type", type); 316 303 syn.setAttribute("value", value); 317 syn. addReference(new Reference("source", dbId));304 syn.setReference("source", dbId); 318 305 synonyms.put(key, syn); 319 306 delayedItems.add(syn); 320 307 } 321 322 308 } 323 309 … … 339 325 synonyms = new HashMap(); 340 326 341 ReferenceList evidenceColl = new ReferenceList("dataSets", new ArrayList()); 342 proteinDomain.addCollection(evidenceColl); 343 evidenceColl.addRefId(dataset.getIdentifier()); 327 proteinDomain.setCollection("dataSets", 328 new ArrayList(Collections.singleton(dataset.getIdentifier()))); 344 329 345 330 description = new StringBuffer(); 346 347 331 } 348 332 trunk/bio/sources/interpro/test/resources/InterproConverterTest_tgt.xml
r15816 r16232 1 1 <items> 2 <item id="2_2" class="http://www.flymine.org/model/genomic#ProteinDomain"> 3 <attribute name="primaryIdentifier" value="IPR100000"/> 4 </item> 5 <item id="2_7" class="http://www.flymine.org/model/genomic#ProteinDomain"> 6 <attribute name="primaryIdentifier" value="IPR400000"/> 7 </item> 2 8 <item id="0_4" class="http://www.flymine.org/model/genomic#DataSource"> 3 9 <attribute name="name" value="Datasource 3"/> 4 </item>5 <item id="2_3" class="http://www.flymine.org/model/genomic#ProteinDomain">6 <attribute name="primaryIdentifier" value="IPR200001"/>7 </item>8 <item id="4_2" class="http://www.flymine.org/model/genomic#Publication">9 <attribute name="pubMedId" value="2222222"/>10 10 </item> 11 11 <item id="5_5" class="http://www.flymine.org/model/genomic#Synonym"> … … 15 15 <reference name="subject" ref_id="2_1"/> 16 16 </item> 17 <item id="4_2" class="http://www.flymine.org/model/genomic#Publication"> 18 <attribute name="pubMedId" value="2222222"/> 19 </item> 20 <item id="2_4" class="http://www.flymine.org/model/genomic#ProteinDomain"> 21 <attribute name="primaryIdentifier" value="IPR200002"/> 22 </item> 17 23 <item id="4_5" class="http://www.flymine.org/model/genomic#Publication"> 18 24 <attribute name="pubMedId" value="9999999"/> 19 25 </item> 20 <item id="2_2" class="http://www.flymine.org/model/genomic#ProteinDomain"> 21 <attribute name="primaryIdentifier" value="IPR100000"/> 26 <item id="3_2" class="http://www.flymine.org/model/genomic#DomainRelationship"> 22 27 </item> 23 <item id="2_5" class="http://www.flymine.org/model/genomic#ProteinDomain"> 24 <attribute name="primaryIdentifier" value="IPR200003"/> 25 </item> 26 <item id="3_2" class="http://www.flymine.org/model/genomic#DomainRelationship"> 28 <item id="2_8" class="http://www.flymine.org/model/genomic#ProteinDomain"> 29 <attribute name="description" value="<p>PAS domains are involved in many signalling proteins where theyare used as a signal sensor domain. PAS domains appear in archaea,bacteria and eukaryotes. Several PAS-domain proteins are known todetect their signal by way of an associated cofactor. Haeme,flavin, and a 4-hydroxycinnamyl chromophore are used in differentproteins. The PAS domain was named after three proteins that itoccurs in: </p><li>Per- period circadian protein</li><li>Arnt- Ah receptor nuclear translocator protein</li><li>Sim- single-minded protein.</li><p>PAS domains are often associated withPAC domains . It appears that these domains are directly linked, and that together they form the conserved 3D PAS fold. The division between the PAS and PAC domains is caused by major differences in sequences in the region connecting these two motifs . In human PAS kinase, this region has been shown to be very flexible, and adopts different conformations depending on the bound ligand .Probably the most surprising identification of a PAS domain was that inEAG-like K<sup>+</sup>-channels .</p>"/> 30 <attribute name="name" value="monkey pants"/> 31 <attribute name="primaryIdentifier" value="IPR00002"/> 32 <attribute name="shortName" value="monkey"/> 33 <attribute name="type" value="Domain"/> 34 <reference name="domainRelationships" ref_id="3_2"/> 35 <collection name="dataSets"><reference ref_id="1_1"/></collection> 36 <collection name="publications"><reference ref_id="4_1"/><reference ref_id="4_2"/><reference ref_id="4_5"/></collection> 27 37 </item> 28 38 <item id="5_2" class="http://www.flymine.org/model/genomic#Synonym"> … … 47 57 <reference name="subject" ref_id="2_1"/> 48 58 </item> 59 <item id="5_13" class="http://www.flymine.org/model/genomic#Synonym"> 60 <attribute name="type" value="identifier"/> 61 <attribute name="value" value="IPR200003"/> 62 <reference name="source" ref_id="0_1"/> 63 <reference name="subject" ref_id="2_5"/> 64 </item> 49 65 <item id="0_2" class="http://www.flymine.org/model/genomic#DataSource"> 50 66 <attribute name="name" value="Datasource 1"/> 51 </item>52 <item id="2_6" class="http://www.flymine.org/model/genomic#ProteinDomain">53 <attribute name="primaryIdentifier" value="IPR300000"/>54 67 </item> 55 68 <item id="5_1" class="http://www.flymine.org/model/genomic#Synonym"> … … 65 78 <attribute name="pubMedId" value="1111111"/> 66 79 </item> 80 <item id="2_1" class="http://www.flymine.org/model/genomic#ProteinDomain"> 81 <attribute name="description" value="Kringles are autonomous structural domains, found throughout the blood clotting and fibrinolytic proteins.Kringle domains are believed to play a role in binding mediators (e.g., membranes,other proteins or phospholipids), and in the regulation of proteolytic activity, , . Kringle domains , , are characterised by a triple loop, 3-disulphide bridge structure, whose conformation is defined by a number of hydrogen bonds and small pieces of anti-parallel beta-sheet. They are found in a varying number of copies in some plasma proteins including prothrombin and urokinase-type plasminogen activator, which are serine proteases belonging to MEROPS peptidase family S1A."/> 82 <attribute name="name" value="supertubby"/> 83 <attribute name="primaryIdentifier" value="IPR000001"/> 84 <attribute name="shortName" value="tubby"/> 85 <attribute name="type" value="Domain"/> 86 <reference name="domainRelationships" ref_id="3_1"/> 87 <collection name="dataSets"><reference ref_id="1_1"/></collection> 88 <collection name="publications"><reference ref_id="4_1"/><reference ref_id="4_2"/><reference ref_id="4_3"/><reference ref_id="4_4"/></collection> 89 </item> 67 90 <item id="3_1" class="http://www.flymine.org/model/genomic#DomainRelationship"> 68 91 <collection name="childFeatures"><reference ref_id="2_3"/><reference ref_id="2_4"/><reference ref_id="2_5"/></collection> … … 74 97 <attribute name="name" value="InterPro"/> 75 98 </item> 76 <item id="2_1" class="http://www.flymine.org/model/genomic#ProteinDomain"> 77 <attribute name="description" value="Kringles are autonomous structural domains, found throughout the blood clotting and fibrinolytic proteins.Kringle domains are believed to play a role in binding mediators (e.g., membranes,other proteins or phospholipids), and in the regulation of proteolytic activity, , . Kringle domains , , are characterised by a triple loop, 3-disulphide bridge structure, whose conformation is defined by a number of hydrogen bonds and small pieces of anti-parallel beta-sheet. They are found in a varying number of copies in some plasma proteins including prothrombin and urokinase-type plasminogen activator, which are serine proteases belonging to MEROPS peptidase family S1A."/> 78 <attribute name="primaryIdentifier" value="IPR000001"/> 79 <attribute name="name" value="supertubby"/> 80 <attribute name="shortName" value="tubby"/> 81 <attribute name="type" value="Domain"/> 82 <reference name="domainRelationships" ref_id="3_1"/> 83 <collection name="dataSets"><reference ref_id="1_1"/></collection> 84 <collection name="publications"><reference ref_id="4_1"/><reference ref_id="4_2"/><reference ref_id="4_3"/><reference ref_id="4_4"/></collection> 99 <item id="2_3" class="http://www.flymine.org/model/genomic#ProteinDomain"> 100 <attribute name="primaryIdentifier" value="IPR200001"/> 85 101 </item> 86 102 <item id="5_6" class="http://www.flymine.org/model/genomic#Synonym"> … … 88 104 <attribute name="value" value="Synonym 4"/> 89 105 <reference name="source" ref_id="0_3"/> 106 <reference name="subject" ref_id="2_1"/> 107 </item> 108 <item id="5_15" class="http://www.flymine.org/model/genomic#Synonym"> 109 <attribute name="type" value="identifier"/> 110 <attribute name="value" value="IPR00002"/> 111 <reference name="source" ref_id="0_1"/> 112 <reference name="subject" ref_id="2_8"/> 113 </item> 114 <item id="5_14" class="http://www.flymine.org/model/genomic#Synonym"> 115 <attribute name="type" value="identifier"/> 116 <attribute name="value" value="IPR000001"/> 117 <reference name="source" ref_id="0_1"/> 90 118 <reference name="subject" ref_id="2_1"/> 91 119 </item> … … 96 124 <reference name="subject" ref_id="2_1"/> 97 125 </item> 126 <item id="2_6" class="http://www.flymine.org/model/genomic#ProteinDomain"> 127 <attribute name="primaryIdentifier" value="IPR300000"/> 128 </item> 98 129 <item id="1_1" class="http://www.flymine.org/model/genomic#DataSet"> 99 130 <attribute name="title" value="InterPro data set"/> … … 103 134 <attribute name="pubMedId" value="4444444"/> 104 135 </item> 136 <item id="5_11" class="http://www.flymine.org/model/genomic#Synonym"> 137 <attribute name="type" value="identifier"/> 138 <attribute name="value" value="IPR100000"/> 139 <reference name="source" ref_id="0_1"/> 140 <reference name="subject" ref_id="2_2"/> 141 </item> 142 <item id="5_10" class="http://www.flymine.org/model/genomic#Synonym"> 143 <attribute name="type" value="identifier"/> 144 <attribute name="value" value="IPR300000"/> 145 <reference name="source" ref_id="0_1"/> 146 <reference name="subject" ref_id="2_6"/> 147 </item> 105 148 <item id="5_3" class="http://www.flymine.org/model/genomic#Synonym"> 106 149 <attribute name="type" value="name"/> … … 109 152 <reference name="subject" ref_id="2_1"/> 110 153 </item> 111 <item id="2_8" class="http://www.flymine.org/model/genomic#ProteinDomain"> 112 <attribute name="description" value="<p>PAS domains are involved in many signalling proteins where theyare used as a signal sensor domain. PAS domains appear in archaea,bacteria and eukaryotes. Several PAS-domain proteins are known todetect their signal by way of an associated cofactor. Haeme,flavin, and a 4-hydroxycinnamyl chromophore are used in differentproteins. The PAS domain was named after three proteins that itoccurs in: </p><li>Per- period circadian protein</li><li>Arnt- Ah receptor nuclear translocator protein</li><li>Sim- single-minded protein.</li><p>PAS domains are often associated withPAC domains . It appears that these domains are directly linked, and that together they form the conserved 3D PAS fold. The division between the PAS and PAC domains is caused by major differences in sequences in the region connecting these two motifs . In human PAS kinase, this region has been shown to be very flexible, and adopts different conformations depending on the bound ligand .Probably the most surprising identification of a PAS domain was that inEAG-like K<sup>+</sup>-channels .</p>"/> 113 <attribute name="primaryIdentifier" value="IPR00002"/> 114 <attribute name="name" value="monkey pants"/> 115 <attribute name="shortName" value="monkey"/> 116 <attribute name="type" value="Domain"/> 117 <reference name="domainRelationships" ref_id="3_2"/> 118 <collection name="dataSets"><reference ref_id="1_1"/></collection> 119 <collection name="publications"><reference ref_id="4_1"/><reference ref_id="4_2"/><reference ref_id="4_5"/></collection> 154 <item id="5_16" class="http://www.flymine.org/model/genomic#Synonym"> 155 <attribute name="type" value="identifier"/> 156 <attribute name="value" value="IPR200002"/> 157 <reference name="source" ref_id="0_1"/> 158 <reference name="subject" ref_id="2_4"/> 120 159 </item> 121 <item id="2_7" class="http://www.flymine.org/model/genomic#ProteinDomain"> 122 <attribute name="primaryIdentifier" value="IPR400000"/> 160 <item id="5_9" class="http://www.flymine.org/model/genomic#Synonym"> 161 <attribute name="type" value="identifier"/> 162 <attribute name="value" value="IPR200001"/> 163 <reference name="source" ref_id="0_1"/> 164 <reference name="subject" ref_id="2_3"/> 123 165 </item> 124 <item id="2_4" class="http://www.flymine.org/model/genomic#ProteinDomain"> 125 <attribute name="primaryIdentifier" value="IPR200002"/> 166 <item id="5_12" class="http://www.flymine.org/model/genomic#Synonym"> 167 <attribute name="type" value="identifier"/> 168 <attribute name="value" value="IPR400000"/> 169 <reference name="source" ref_id="0_1"/> 170 <reference name="subject" ref_id="2_7"/> 171 </item> 172 <item id="2_5" class="http://www.flymine.org/model/genomic#ProteinDomain"> 173 <attribute name="primaryIdentifier" value="IPR200003"/> 126 174 </item> 127 175 </items>
