| 1 | package org.intermine.bio.dataconversion; |
|---|
| 2 | |
|---|
| 3 | /* |
|---|
| 4 | * Copyright (C) 2002-2011 FlyMine |
|---|
| 5 | * |
|---|
| 6 | * This code may be freely distributed and modified under the |
|---|
| 7 | * terms of the GNU Lesser General Public Licence. This should |
|---|
| 8 | * be distributed with the code. See the LICENSE file for more |
|---|
| 9 | * information or http://www.gnu.org/copyleft/lesser.html. |
|---|
| 10 | * |
|---|
| 11 | */ |
|---|
| 12 | |
|---|
| 13 | import java.lang.reflect.Constructor; |
|---|
| 14 | import java.sql.Connection; |
|---|
| 15 | import java.sql.ResultSet; |
|---|
| 16 | import java.sql.SQLException; |
|---|
| 17 | import java.sql.Statement; |
|---|
| 18 | import java.util.ArrayList; |
|---|
| 19 | import java.util.HashMap; |
|---|
| 20 | import java.util.HashSet; |
|---|
| 21 | import java.util.List; |
|---|
| 22 | import java.util.Map; |
|---|
| 23 | import java.util.Set; |
|---|
| 24 | |
|---|
| 25 | import org.apache.commons.lang.StringUtils; |
|---|
| 26 | import org.apache.log4j.Logger; |
|---|
| 27 | import org.intermine.bio.util.OrganismData; |
|---|
| 28 | import org.intermine.bio.util.OrganismRepository; |
|---|
| 29 | import org.intermine.dataconversion.ItemWriter; |
|---|
| 30 | import org.intermine.metadata.Model; |
|---|
| 31 | import org.intermine.sql.Database; |
|---|
| 32 | import org.intermine.util.StringUtil; |
|---|
| 33 | |
|---|
| 34 | /** |
|---|
| 35 | * DataConverter to read from a Chado database into items |
|---|
| 36 | * @author Kim Rutherford |
|---|
| 37 | */ |
|---|
| 38 | public class ChadoDBConverter extends BioDBConverter |
|---|
| 39 | { |
|---|
| 40 | protected static final Logger LOG = Logger.getLogger(ChadoDBConverter.class); |
|---|
| 41 | |
|---|
| 42 | // a Map from chado organism_id to taxonId |
|---|
| 43 | private final Map<Integer, OrganismData> chadoToOrgData = new HashMap<Integer, OrganismData>(); |
|---|
| 44 | private String processors = ""; |
|---|
| 45 | |
|---|
| 46 | private final Set<OrganismData> organismsToProcess = new HashSet<OrganismData>(); |
|---|
| 47 | |
|---|
| 48 | private final OrganismRepository organismRepository; |
|---|
| 49 | |
|---|
| 50 | private final List<ChadoProcessor> completedProcessors = new ArrayList<ChadoProcessor>(); |
|---|
| 51 | |
|---|
| 52 | private Connection connection; |
|---|
| 53 | |
|---|
| 54 | |
|---|
| 55 | /** |
|---|
| 56 | * Create a new ChadoDBConverter object. |
|---|
| 57 | * @param database the database to read from |
|---|
| 58 | * @param tgtModel the Model used by the object store we will write to with the ItemWriter |
|---|
| 59 | * @param writer an ItemWriter used to handle the resultant Items |
|---|
| 60 | * @throws SQLException if we fail to get a database connection |
|---|
| 61 | |
|---|
| 62 | */ |
|---|
| 63 | public ChadoDBConverter(Database database, Model tgtModel, ItemWriter writer) |
|---|
| 64 | throws SQLException { |
|---|
| 65 | super(database, tgtModel, writer, null, null); |
|---|
| 66 | organismRepository = OrganismRepository.getOrganismRepository(); |
|---|
| 67 | if (getDatabase() == null) { |
|---|
| 68 | // no Database when testing and no connection needed |
|---|
| 69 | connection = null; |
|---|
| 70 | } else { |
|---|
| 71 | connection = getDatabase().getConnection(); |
|---|
| 72 | } |
|---|
| 73 | } |
|---|
| 74 | |
|---|
| 75 | /** |
|---|
| 76 | * Set the taxon ids to use when creating the Organism Item for the new features. Only features |
|---|
| 77 | * from chado with these organisms will be processed. |
|---|
| 78 | * @param organisms a space separated list of the organism abbreviations or taxon ids to look |
|---|
| 79 | * up in the organism table eg. "Dmel Dpse" |
|---|
| 80 | */ |
|---|
| 81 | public void setOrganisms(String organisms) { |
|---|
| 82 | String[] bits = StringUtil.split(organisms, " "); |
|---|
| 83 | //for (int i = 0; i < bits.length; i++) { |
|---|
| 84 | for (String organismIdString: bits) { |
|---|
| 85 | OrganismData od = null; |
|---|
| 86 | try { |
|---|
| 87 | Integer taxonId = Integer.valueOf(organismIdString); |
|---|
| 88 | od = organismRepository.getOrganismDataByTaxon(taxonId); |
|---|
| 89 | } catch (NumberFormatException e) { |
|---|
| 90 | od = organismRepository.getOrganismDataByAbbreviation(organismIdString); |
|---|
| 91 | } |
|---|
| 92 | if (od == null) { |
|---|
| 93 | throw new RuntimeException("can't find organism for: " + organismIdString); |
|---|
| 94 | } |
|---|
| 95 | organismsToProcess.add(od); |
|---|
| 96 | } |
|---|
| 97 | } |
|---|
| 98 | |
|---|
| 99 | /** |
|---|
| 100 | * Set the class names of the ChadoProcessors to run. |
|---|
| 101 | * @param processors a space separated list of the fully-qualified class names of module |
|---|
| 102 | * processors to run |
|---|
| 103 | */ |
|---|
| 104 | public void setProcessors(String processors) { |
|---|
| 105 | this.processors = processors; |
|---|
| 106 | } |
|---|
| 107 | |
|---|
| 108 | /** |
|---|
| 109 | * Return a map from chado organism_id to OrganismData object for all the organisms that we |
|---|
| 110 | * are processing |
|---|
| 111 | * @return the Map |
|---|
| 112 | */ |
|---|
| 113 | public Map<Integer, OrganismData> getChadoIdToOrgDataMap() { |
|---|
| 114 | return chadoToOrgData; |
|---|
| 115 | } |
|---|
| 116 | |
|---|
| 117 | /** |
|---|
| 118 | * Get the connection to use when processing. |
|---|
| 119 | * @return the Connection, or null while testing |
|---|
| 120 | */ |
|---|
| 121 | protected Connection getConnection() { |
|---|
| 122 | return connection; |
|---|
| 123 | } |
|---|
| 124 | |
|---|
| 125 | /** |
|---|
| 126 | * Process the data from the Database and write to the ItemWriter. |
|---|
| 127 | * {@inheritDoc} |
|---|
| 128 | */ |
|---|
| 129 | @Override |
|---|
| 130 | public void process() throws Exception { |
|---|
| 131 | |
|---|
| 132 | if (StringUtils.isEmpty(processors)) { |
|---|
| 133 | throw new IllegalArgumentException("processors not set in ChadoDBConverter"); |
|---|
| 134 | } |
|---|
| 135 | |
|---|
| 136 | Map<OrganismData, Integer> tempChadoOrgMap = getChadoOrganismIds(getConnection()); |
|---|
| 137 | |
|---|
| 138 | for (OrganismData od: organismsToProcess) { |
|---|
| 139 | Integer chadoId = tempChadoOrgMap.get(od); |
|---|
| 140 | if (chadoId == null) { |
|---|
| 141 | throw new RuntimeException("Organism " + od |
|---|
| 142 | + " not found in the chado organism table"); |
|---|
| 143 | } |
|---|
| 144 | chadoToOrgData.put(chadoId, od); |
|---|
| 145 | } |
|---|
| 146 | |
|---|
| 147 | if (chadoToOrgData.size() == 0) { |
|---|
| 148 | throw new RuntimeException("can't find any known organisms in the organism table"); |
|---|
| 149 | } |
|---|
| 150 | |
|---|
| 151 | String[] bits = processors.trim().split("[ \\t]+"); |
|---|
| 152 | for (int i = 0; i < bits.length; i++) { |
|---|
| 153 | String className = bits[i]; |
|---|
| 154 | if (!StringUtils.isEmpty(className)) { |
|---|
| 155 | Class<?> cls = Class.forName(className); |
|---|
| 156 | Constructor<?> constructor = cls.getDeclaredConstructor(ChadoDBConverter.class); |
|---|
| 157 | ChadoProcessor currentProcessor = (ChadoProcessor) constructor.newInstance(this); |
|---|
| 158 | currentProcessor.process(getConnection()); |
|---|
| 159 | getCompletedProcessors().add(currentProcessor); |
|---|
| 160 | } |
|---|
| 161 | } |
|---|
| 162 | } |
|---|
| 163 | |
|---|
| 164 | /** |
|---|
| 165 | * Return a map from chado organism id to OrganismData for the organisms in the organism table |
|---|
| 166 | * in chado. This is a protected method so that it can be overriden for testing |
|---|
| 167 | * @param conn the db connection |
|---|
| 168 | * @param organismsToProcess2 |
|---|
| 169 | * @return a Map from abbreviation to chado organism_id |
|---|
| 170 | * @throws SQLException if the is a database problem |
|---|
| 171 | */ |
|---|
| 172 | protected Map<OrganismData, Integer> getChadoOrganismIds(Connection conn) |
|---|
| 173 | throws SQLException { |
|---|
| 174 | String query = "select organism_id, abbreviation, genus, species from organism"; |
|---|
| 175 | LOG.info("executing: " + query); |
|---|
| 176 | Statement stmt = conn.createStatement(); |
|---|
| 177 | ResultSet res = stmt.executeQuery(query); |
|---|
| 178 | |
|---|
| 179 | Map<OrganismData, Integer> retMap = new HashMap<OrganismData, Integer>(); |
|---|
| 180 | |
|---|
| 181 | OrganismRepository or = OrganismRepository.getOrganismRepository(); |
|---|
| 182 | |
|---|
| 183 | while (res.next()) { |
|---|
| 184 | int organismId = res.getInt("organism_id"); |
|---|
| 185 | String abbreviation = res.getString("abbreviation"); |
|---|
| 186 | String genus = res.getString("genus"); |
|---|
| 187 | String species = res.getString("species"); |
|---|
| 188 | |
|---|
| 189 | OrganismData od = null; |
|---|
| 190 | |
|---|
| 191 | if (genus != null && species != null) { |
|---|
| 192 | od = or.getOrganismDataByGenusSpecies(genus, species); |
|---|
| 193 | } |
|---|
| 194 | |
|---|
| 195 | if (od == null) { |
|---|
| 196 | if (abbreviation != null) { |
|---|
| 197 | od = or.getOrganismDataByAbbreviation(abbreviation); |
|---|
| 198 | } |
|---|
| 199 | } |
|---|
| 200 | |
|---|
| 201 | if (od == null) { |
|---|
| 202 | LOG.warn("can't find OrganismData for species: " + species |
|---|
| 203 | + " genus: " + genus + " abbreviation: " + abbreviation); |
|---|
| 204 | } |
|---|
| 205 | |
|---|
| 206 | retMap.put(od, new Integer(organismId)); |
|---|
| 207 | } |
|---|
| 208 | |
|---|
| 209 | return retMap; |
|---|
| 210 | } |
|---|
| 211 | |
|---|
| 212 | /** |
|---|
| 213 | * Return the OrganismData objects for the organisms listed in the source configuration. |
|---|
| 214 | * @return the organismsToProcess |
|---|
| 215 | */ |
|---|
| 216 | public Set<OrganismData> getOrganismsToProcess() { |
|---|
| 217 | return organismsToProcess; |
|---|
| 218 | } |
|---|
| 219 | |
|---|
| 220 | /** |
|---|
| 221 | * Look at the list of completed processors and return the processor of the given type. If |
|---|
| 222 | * there is none or more than one, throw a RuntimeException |
|---|
| 223 | * @param cls the class |
|---|
| 224 | * @return the ChadoProcessor |
|---|
| 225 | */ |
|---|
| 226 | public ChadoProcessor findProcessor(Class<? extends ChadoProcessor> cls) { |
|---|
| 227 | ChadoProcessor returnProcessor = null; |
|---|
| 228 | |
|---|
| 229 | for (ChadoProcessor processor: getCompletedProcessors()) { |
|---|
| 230 | if (cls.isAssignableFrom(processor.getClass())) { |
|---|
| 231 | if (returnProcessor == null) { |
|---|
| 232 | returnProcessor = processor; |
|---|
| 233 | } else { |
|---|
| 234 | throw new RuntimeException("Completed processors list contains two objects of " |
|---|
| 235 | + "type: " + cls.getName()); |
|---|
| 236 | } |
|---|
| 237 | } |
|---|
| 238 | } |
|---|
| 239 | |
|---|
| 240 | if (returnProcessor == null) { |
|---|
| 241 | throw new RuntimeException("Can't find `" + cls.getName() + "` before `" |
|---|
| 242 | + this.getClass().getName() |
|---|
| 243 | + "` in the list of completed processors - must run " |
|---|
| 244 | + cls.getName() + " first."); |
|---|
| 245 | } |
|---|
| 246 | return returnProcessor; |
|---|
| 247 | } |
|---|
| 248 | /** |
|---|
| 249 | * Default implementation that makes a data set title based on the data source name. |
|---|
| 250 | * {@inheritDoc} |
|---|
| 251 | */ |
|---|
| 252 | @Override |
|---|
| 253 | public String getDataSetTitle(int taxonId) { |
|---|
| 254 | OrganismData od = organismRepository.getOrganismDataByTaxon(new Integer(taxonId)); |
|---|
| 255 | if (od != null) { |
|---|
| 256 | return getDataSourceName() + " data set for " + od.getGenus() + " " + od.getSpecies(); |
|---|
| 257 | } |
|---|
| 258 | return getDataSourceName() + " data set"; |
|---|
| 259 | } |
|---|
| 260 | |
|---|
| 261 | /** |
|---|
| 262 | * @return the completedProcessors |
|---|
| 263 | */ |
|---|
| 264 | public List<ChadoProcessor> getCompletedProcessors() { |
|---|
| 265 | return completedProcessors; |
|---|
| 266 | } |
|---|
| 267 | } |
|---|