Changeset 22646
- Timestamp: 02/09/10 10:53:54 (17 months ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/bio/sources/uniprot/main/src/org/intermine/bio/dataconversion/UniprotConverter.java
r22539 r22646 123 123 124 124 // process the sprot file, then the trembl file 125 private void processFiles(File[] files) 126 throws SAXException { 125 private void processFiles(File[] files) { 127 126 for (int i = 0; i <= 1; i++) { 128 127 File file = files[i]; … … 319 318 String domain = entry.getAttribute(); 320 319 if (domain.startsWith("IPR")) { 321 try { 322 entry.addDomainRefId(getInterpro(domain, getAttrValue(attrs, "value"))); 323 } catch (ObjectStoreException e) { 324 throw new SAXException(e); 325 } 320 entry.addDomainRefId(getInterpro(domain, getAttrValue(attrs, "value"))); 326 321 } 327 322 } else if (qName.equals("dbReference") && stack.peek().equals("citation") … … 472 467 473 468 474 private Set<UniprotEntry> processEntry(UniprotEntry entry)469 private Set<UniprotEntry> processEntry(UniprotEntry uniprotEntry) 475 470 throws SAXException, ObjectStoreException { 476 471 entryCount++; … … 480 475 Set<UniprotEntry> isoforms = new HashSet<UniprotEntry>(); 481 476 // have we already seen a protein for this organism with the same sequence? 482 if (!entry.isIsoform() && seenSequence(entry.getTaxonId(), entry.getMd5checksum())) { 477 if (!uniprotEntry.isIsoform() && seenSequence(uniprotEntry.getTaxonId(), 478 uniprotEntry.getMd5checksum())) { 483 479 // if we have seen this sequence before for this organism just add the 484 480 // primaryAccession of this protein as a synonym for the one already stored. 
485 481 Map<String, String> orgSequences = sequences.get(taxonId); 486 if (orgSequences != null && orgSequences.containsKey(entry.getMd5checksum())) { 487 Item synonym = createSynonym(orgSequences.get(entry.getMd5checksum()), 488 entry.getPrimaryAccession(), false); 482 if (orgSequences != null 483 && orgSequences.containsKey(uniprotEntry.getMd5checksum())) { 484 Item synonym = createSynonym(orgSequences.get(uniprotEntry.getMd5checksum()), 485 uniprotEntry.getPrimaryAccession(), false); 489 486 synonymsAndXrefs.add(synonym); 490 487 } … … 493 490 494 491 // TODO there are uniparc entries so check for swissprot-trembl datasets 495 if ( entry.hasDatasetRefId() && entry.hasPrimaryAccession()) {496 497 setDataSet( entry.getDatasetRefId());498 499 for (String isoformAccession: entry.getIsoforms()) {500 isoforms.add( entry.createIsoformEntry(isoformAccession));492 if (uniprotEntry.hasDatasetRefId() && uniprotEntry.hasPrimaryAccession()) { 493 494 setDataSet(uniprotEntry.getDatasetRefId()); 495 496 for (String isoformAccession: uniprotEntry.getIsoforms()) { 497 isoforms.add(uniprotEntry.createIsoformEntry(isoformAccession)); 501 498 } 502 499 … … 504 501 505 502 /* primaryAccession, primaryIdentifier, name, etc */ 506 processIdentifiers(protein, entry);507 508 String isCanonical = ( entry.isIsoform() ? "false" : "true");503 processIdentifiers(protein, uniprotEntry); 504 505 String isCanonical = (uniprotEntry.isIsoform() ? "false" : "true"); 509 506 protein.setAttribute("isUniprotCanonical", isCanonical); 510 507 … … 513 510 514 511 /* sequence */ 515 if (! entry.isIsoform()) {516 processSequence(protein, entry);512 if (!uniprotEntry.isIsoform()) { 513 processSequence(protein, uniprotEntry); 517 514 } 518 515 519 516 /* interpro */ 520 if (createInterpro && ! 
entry.getDomains().isEmpty()) {521 protein.setCollection("proteinDomains", entry.getDomains());522 } 523 524 protein.setReference("organism", getOrganism( entry.getTaxonId()));517 if (createInterpro && !uniprotEntry.getDomains().isEmpty()) { 518 protein.setCollection("proteinDomains", uniprotEntry.getDomains()); 519 } 520 521 protein.setReference("organism", getOrganism(uniprotEntry.getTaxonId())); 525 522 526 523 /* publications */ 527 if (! entry.getPubs().isEmpty()) {528 protein.setCollection("publications", entry.getPubs());524 if (!uniprotEntry.getPubs().isEmpty()) { 525 protein.setCollection("publications", uniprotEntry.getPubs()); 529 526 } 530 527 531 528 /* comments */ 532 if (! entry.getComments().isEmpty()) {533 protein.setCollection("comments", entry.getComments());529 if (!uniprotEntry.getComments().isEmpty()) { 530 protein.setCollection("comments", uniprotEntry.getComments()); 534 531 } 535 532 536 533 /* keywords */ 537 if (! entry.getKeywords().isEmpty()) {538 protein.setCollection("keywords", entry.getKeywords());534 if (!uniprotEntry.getKeywords().isEmpty()) { 535 protein.setCollection("keywords", uniprotEntry.getKeywords()); 539 536 } 540 537 541 538 /* features */ 542 processFeatures(protein, entry);539 processFeatures(protein, uniprotEntry); 543 540 544 541 /* components */ 545 if (! 
entry.getComponents().isEmpty()) {546 processComponents(protein, entry);542 if (!uniprotEntry.getComponents().isEmpty()) { 543 processComponents(protein, uniprotEntry); 547 544 } 548 545 549 546 // record that we have seen this sequence for this organism 550 addSeenSequence( entry.getTaxonId(), entry.getMd5checksum(),547 addSeenSequence(uniprotEntry.getTaxonId(), uniprotEntry.getMd5checksum(), 551 548 protein.getIdentifier()); 552 549 553 550 try { 554 551 /* dbrefs (go terms, refseq) */ 555 processDbrefs(protein, entry);552 processDbrefs(protein, uniprotEntry); 556 553 557 554 /* genes */ 558 processGene(protein, entry);555 processGene(protein, uniprotEntry); 559 556 560 557 store(protein); 561 558 562 559 // create synonyms for accessions and store xrefs and synonyms we've collected 563 processSynonyms(protein.getIdentifier(), entry);560 processSynonyms(protein.getIdentifier(), uniprotEntry); 564 561 565 562 } catch (ObjectStoreException e) { … … 571 568 } 572 569 573 574 private void processSequence(Item protein, UniprotEntry entry) { 570 private void processSequence(Item protein, UniprotEntry uniprotEntry) { 575 571 Item item = createItem("Sequence"); 576 item.setAttribute("residues", entry.getSequence());577 item.setAttribute("length", entry.getLength());572 item.setAttribute("residues", uniprotEntry.getSequence()); 573 item.setAttribute("length", uniprotEntry.getLength()); 578 574 try { 579 575 store(item); … … 581 577 throw new RuntimeException(e); 582 578 } 583 protein.setAttribute("length", entry.getLength());579 protein.setAttribute("length", uniprotEntry.getLength()); 584 580 protein.setReference("sequence", item.getIdentifier()); 585 protein.setAttribute("molecularWeight", entry.getMolecularWeight());586 protein.setAttribute("md5checksum", entry.getMd5checksum());587 } 588 589 private void processIdentifiers(Item protein, UniprotEntry entry) {590 protein.setAttribute("name", entry.getName());591 protein.setAttribute("isFragment", entry.isFragment());592 
protein.setAttribute("uniprotAccession", entry.getUniprotAccession());593 String primaryAccession = entry.getPrimaryAccession();581 protein.setAttribute("molecularWeight", uniprotEntry.getMolecularWeight()); 582 protein.setAttribute("md5checksum", uniprotEntry.getMd5checksum()); 583 } 584 585 private void processIdentifiers(Item protein, UniprotEntry uniprotEntry) { 586 protein.setAttribute("name", uniprotEntry.getName()); 587 protein.setAttribute("isFragment", uniprotEntry.isFragment()); 588 protein.setAttribute("uniprotAccession", uniprotEntry.getUniprotAccession()); 589 String primaryAccession = uniprotEntry.getPrimaryAccession(); 594 590 protein.setAttribute("primaryAccession", primaryAccession); 595 591 596 String primaryIdentifier = entry.getPrimaryIdentifier();592 String primaryIdentifier = uniprotEntry.getPrimaryIdentifier(); 597 593 protein.setAttribute("uniprotName", primaryIdentifier); 598 594 599 595 // primaryIdentifier must be unique, so append isoform suffix, eg -1 600 if ( entry.isIsoform()) {596 if (uniprotEntry.isIsoform()) { 601 597 primaryIdentifier = getIsoformIdentifier(primaryAccession, primaryIdentifier); 602 598 } … … 613 609 } 614 610 615 private void processComponents(Item protein, UniprotEntry entry)611 private void processComponents(Item protein, UniprotEntry uniprotEntry) 616 612 throws SAXException { 617 for (String componentName : entry.getComponents()) {613 for (String componentName : uniprotEntry.getComponents()) { 618 614 Item component = createItem("Component"); 619 615 component.setAttribute("name", componentName); … … 627 623 } 628 624 629 private void processFeatures(Item protein, UniprotEntry entry)625 private void processFeatures(Item protein, UniprotEntry uniprotEntry) 630 626 throws SAXException { 631 for (Item feature : entry.getFeatures()) {627 for (Item feature : uniprotEntry.getFeatures()) { 632 628 feature.setReference("protein", protein); 633 629 try { … … 639 635 } 640 636 641 private void processSynonyms(String 
proteinRefId, UniprotEntry entry)637 private void processSynonyms(String proteinRefId, UniprotEntry uniprotEntry) 642 638 throws SAXException, ObjectStoreException { 643 639 644 640 // accessions 645 for (String accession : entry.getAccessions()) {641 for (String accession : uniprotEntry.getAccessions()) { 646 642 createSynonym(proteinRefId, accession, true); 647 643 } 648 644 649 645 // primaryIdentifier if isoform 650 if ( entry.isIsoform()) {646 if (uniprotEntry.isIsoform()) { 651 647 String isoformIdentifier = 652 getIsoformIdentifier(entry.getPrimaryAccession(), entry.getPrimaryIdentifier()); 648 getIsoformIdentifier(uniprotEntry.getPrimaryAccession(), 649 uniprotEntry.getPrimaryIdentifier()); 653 650 createSynonym(proteinRefId, isoformIdentifier, true); 654 651 } 655 652 656 653 // name <recommendedName> or <alternateName> 657 for (String name : entry.getProteinNames()) {654 for (String name : uniprotEntry.getProteinNames()) { 658 655 createSynonym(proteinRefId, name, true); 659 656 } 660 657 661 658 // isoforms with extra identifiers 662 List<String> isoformSynonyms = entry.getIsoformSynonyms();659 List<String> isoformSynonyms = uniprotEntry.getIsoformSynonyms(); 663 660 if (!isoformSynonyms.isEmpty()) { 664 661 for (String identifier : isoformSynonyms) { … … 676 673 } 677 674 678 private void processDbrefs(Item protein, UniprotEntry entry)675 private void processDbrefs(Item protein, UniprotEntry uniprotEntry) 679 676 throws SAXException, ObjectStoreException { 680 Map<String, List<String>> dbrefs = entry.getDbrefs();677 Map<String, List<String>> dbrefs = uniprotEntry.getDbrefs(); 681 678 682 679 for (Map.Entry<String, List<String>> dbref : dbrefs.entrySet()) { … … 692 689 setCrossReference(protein.getIdentifier(), identifier, key, false); 693 690 if (creatego && key.equals("GO")) { 694 entry.addGOTerm(getGoTerm(identifier));691 uniprotEntry.addGOTerm(getGoTerm(identifier)); 695 692 } 696 693 } … … 710 707 } 711 708 712 private void 
processGoAnnotation(UniprotEntry entry, Item gene)709 private void processGoAnnotation(UniprotEntry uniprotEntry, Item gene) 713 710 throws SAXException { 714 for (String goTermRefId : entry.getGOTerms()) {711 for (String goTermRefId : uniprotEntry.getGOTerms()) { 715 712 Item goAnnotation = createItem("GOAnnotation"); 716 713 goAnnotation.setReference("subject", gene); … … 727 724 // gets the unique identifier and list of identifiers to set 728 725 // loops through each gene entry, assigns refId to protein 729 private void processGene(Item protein, UniprotEntry entry)726 private void processGene(Item protein, UniprotEntry uniprotEntry) 730 727 throws SAXException, ObjectStoreException { 731 String tax onId = entry.getTaxonId();728 String taxId = uniprotEntry.getTaxonId(); 732 729 733 730 // which gene.identifier field has to be unique 734 String uniqueIdentifierField = CONFIG.getUniqueIdentifier(tax onId);731 String uniqueIdentifierField = CONFIG.getUniqueIdentifier(taxId); 735 732 if (uniqueIdentifierField == null) { 736 733 uniqueIdentifierField = CONFIG.getUniqueIdentifier("default"); … … 738 735 739 736 // for this organism, set the following gene fields 740 Set<String> geneFields = CONFIG.getGeneIdentifierFields(tax onId);737 Set<String> geneFields = CONFIG.getGeneIdentifierFields(taxId); 741 738 if (geneFields == null) { 742 739 geneFields = CONFIG.getGeneIdentifierFields("default"); … … 744 741 745 742 // just one gene, don't have to worry about gene designations and dbrefs 746 if (!entry.hasMultipleGenes()) { 747 String geneRefId = createGene(entry, taxonId, geneFields, uniqueIdentifierField); 743 if (!uniprotEntry.hasMultipleGenes()) { 744 String geneRefId = createGene(uniprotEntry, taxId, geneFields, 745 uniqueIdentifierField); 748 746 if (geneRefId != null) { 749 747 protein.addToCollection("genes", geneRefId); … … 754 752 // loop through each gene entry to be processed 755 753 // cloning the gene removes dbrefs without gene designations 756 
List<UniprotEntry> clonedEntries = entry.cloneGenes();754 List<UniprotEntry> clonedEntries = uniprotEntry.cloneGenes(); 757 755 Iterator<UniprotEntry> iter = clonedEntries.iterator(); 758 756 while (iter.hasNext()) { 759 757 // create a dummy entry and add identifiers for specific gene 760 String geneRefId = createGene(iter.next(), tax onId, geneFields,758 String geneRefId = createGene(iter.next(), taxId, geneFields, 761 759 uniqueIdentifierField); 762 760 if (StringUtils.isNotEmpty(geneRefId)) { … … 769 767 // sets the identifier fields specified in the config file 770 768 // creates synonym 771 private String createGene(UniprotEntry entry, String taxonId, Set<String> geneFields,769 private String createGene(UniprotEntry uniprotEntry, String taxId, Set<String> geneFields, 772 770 String uniqueIdentifierFieldType) 773 771 throws SAXException, ObjectStoreException { … … 775 773 List<String> geneSynonyms = new ArrayList<String>(); 776 774 777 String uniqueIdentifierValue = getGeneIdentifier( entry, taxonId,775 String uniqueIdentifierValue = getGeneIdentifier(uniprotEntry, taxId, 778 776 uniqueIdentifierFieldType, geneSynonyms, true); 779 777 if (uniqueIdentifierValue == null) { … … 792 790 continue; 793 791 } 794 String identifier = getGeneIdentifier( entry, taxonId, geneField, geneSynonyms,795 false);792 String identifier = getGeneIdentifier(uniprotEntry, taxId, geneField, 793 geneSynonyms, false); 796 794 797 795 if (identifier == null) { … … 804 802 * identifier will always be a duplicate in this case. 805 803 */ 806 if (! 
entry.isIsoform() && geneIdentifiers.contains(identifier)) {804 if (!uniprotEntry.isIsoform() && geneIdentifiers.contains(identifier)) { 807 805 // TODO this should create a synonym 808 806 LOG.error("not assigning duplicate identifier: " + identifier); … … 817 815 818 816 if (creatego) { 819 processGoAnnotation( entry, gene);817 processGoAnnotation(uniprotEntry, gene); 820 818 } 821 819 822 820 // store gene 823 821 try { 824 gene.setReference("organism", getOrganism(tax onId));822 gene.setReference("organism", getOrganism(taxId)); 825 823 store(gene); 826 824 } catch (ObjectStoreException e) { … … 840 838 // gets the identifier for a gene from the dbref/names collected from the XML 841 839 // which identifier is chosen depends on the configuration in the uniprot config file 842 private String getGeneIdentifier(UniprotEntry entry, String taxonId, String identifierType,843 List<String> geneSynonyms, boolean isUniqueIdentifier) {840 private String getGeneIdentifier(UniprotEntry uniprotEntry, String taxId, 841 String identifierType, List<String> geneSynonyms, boolean isUniqueIdentifier) { 844 842 845 843 String identifierValue = null; 846 844 // how to get the identifier, eg. dbref OR name 847 String method = CONFIG.getIdentifierMethod(tax onId, identifierType);845 String method = CONFIG.getIdentifierMethod(taxId, identifierType); 848 846 // what value to use with method, eg. "FlyBase" or "ORF" 849 String value = CONFIG.getIdentifierValue(tax onId, identifierType);847 String value = CONFIG.getIdentifierValue(taxId, identifierType); 850 848 851 849 if (method == null || value == null) { … … 855 853 if (method == null || value == null) { 856 854 throw new RuntimeException("error processing line in config file for organism " 857 + tax onId);855 + taxId); 858 856 } 859 857 } 860 858 861 859 if (method.equals("name")) { 862 if ( entry.getGeneNames() == null || entry.getGeneNames().isEmpty()) {863 LOG.error("No gene names for " + tax onId + ". 
protein accession:"864 + entry.getPrimaryAccession());860 if (uniprotEntry.getGeneNames() == null || uniprotEntry.getGeneNames().isEmpty()) { 861 LOG.error("No gene names for " + taxId + ". protein accession:" 862 + uniprotEntry.getPrimaryAccession()); 865 863 return null; 866 864 } 867 identifierValue = entry.getGeneNames().get(value);865 identifierValue = uniprotEntry.getGeneNames().get(value); 868 866 } else if (method.equals("dbref")) { 869 867 if (value.equals("Ensembl")) { 870 868 // See #2122 871 identifierValue = entry.getGeneDesignation("Ensembl");869 identifierValue = uniprotEntry.getGeneDesignation("Ensembl"); 872 870 } else { 873 Map<String, List<String>> dbrefs = entry.getDbrefs();871 Map<String, List<String>> dbrefs = uniprotEntry.getDbrefs(); 874 872 String msg = "no " + value + " identifier found for gene attached to protein: " 875 + entry.getPrimaryAccession();873 + uniprotEntry.getPrimaryAccession(); 876 874 if (dbrefs == null || dbrefs.isEmpty()) { 877 875 LOG.error(msg); … … 884 882 } 885 883 // TODO handle multiple identifiers somehow 886 identifierValue = entry.getDbrefs().get(value).get(0);884 identifierValue = uniprotEntry.getDbrefs().get(value).get(0); 887 885 } 888 886 } else { 889 LOG.error("error processing line in config file for organism " + tax onId);887 LOG.error("error processing line in config file for organism " + taxId); 890 888 return null; 891 889 } 892 890 geneSynonyms.add(identifierValue); 893 if (isUniqueIdentifier && tax onId.equals("7227")) {894 identifierValue = resolveGene(tax onId, identifierValue);891 if (isUniqueIdentifier && taxId.equals("7227")) { 892 identifierValue = resolveGene(taxId, identifierValue); 895 893 896 894 // try again! 897 if (identifierValue == null && entry.getGeneNames() != null898 && ! 
entry.getGeneNames().isEmpty()) {899 Iterator<String> iter = entry.getGeneNames().values().iterator();895 if (identifierValue == null && uniprotEntry.getGeneNames() != null 896 && !uniprotEntry.getGeneNames().isEmpty()) { 897 Iterator<String> iter = uniprotEntry.getGeneNames().values().iterator(); 900 898 while (iter.hasNext() && identifierValue == null) { 901 identifierValue = resolveGene(tax onId, iter.next());899 identifierValue = resolveGene(taxId, iter.next()); 902 900 } 903 901 } … … 906 904 } 907 905 908 private String resolveGene(String tax onId, String identifier) {906 private String resolveGene(String taxId, String identifier) { 909 907 flyResolver = resolverFactory.getIdResolver(false); 910 908 if (flyResolver == null) { … … 912 910 return identifier; 913 911 } 914 int resCount = flyResolver.countResolutions(tax onId, identifier);912 int resCount = flyResolver.countResolutions(taxId, identifier); 915 913 if (resCount != 1) { 916 914 LOG.info("RESOLVER: failed to resolve gene to one identifier, ignoring gene: " 917 915 + identifier + " count: " + resCount + " FBgn: " 918 + flyResolver.resolveId(tax onId, identifier));916 + flyResolver.resolveId(taxId, identifier)); 919 917 return null; 920 918 } 921 return flyResolver.resolveId(taxonId, identifier).iterator().next(); 922 } 923 } 924 925 private void addSeenSequence(String taxonId, String md5checksum, String proteinIdentifier) 926 throws SAXException { 919 return flyResolver.resolveId(taxId, identifier).iterator().next(); 920 } 921 } 922 923 private void addSeenSequence(String taxonId, String md5checksum, String proteinIdentifier) { 927 924 Map<String, String> orgSequences = sequences.get(taxonId); 928 925 if (orgSequences == null) { … … 935 932 } 936 933 937 private boolean seenSequence(String taxonId, String md5checksum) 938 throws SAXException { 934 private boolean seenSequence(String taxonId, String md5checksum) { 939 935 Map<String, String> orgSequences = sequences.get(taxonId); 940 936 if 
(orgSequences == null) { … … 983 979 984 980 private String getInterpro(String identifier, String shortName) 985 throws SAXException , ObjectStoreException{981 throws SAXException { 986 982 String refId = domains.get(identifier); 987 983 if (refId == null) {
Note: See TracChangeset for help on using the changeset viewer.
