source: trunk/bio/sources/chado-db/main/src/org/intermine/bio/dataconversion/ChadoDBConverter.java @ 24567

Revision 24567, 9.5 KB checked in by julie, 17 months ago (diff)

update copyright notice

Line 
1package org.intermine.bio.dataconversion;
2
3/*
4 * Copyright (C) 2002-2011 FlyMine
5 *
6 * This code may be freely distributed and modified under the
7 * terms of the GNU Lesser General Public Licence.  This should
8 * be distributed with the code.  See the LICENSE file for more
9 * information or http://www.gnu.org/copyleft/lesser.html.
10 *
11 */
12
13import java.lang.reflect.Constructor;
14import java.sql.Connection;
15import java.sql.ResultSet;
16import java.sql.SQLException;
17import java.sql.Statement;
18import java.util.ArrayList;
19import java.util.HashMap;
20import java.util.HashSet;
21import java.util.List;
22import java.util.Map;
23import java.util.Set;
24
25import org.apache.commons.lang.StringUtils;
26import org.apache.log4j.Logger;
27import org.intermine.bio.util.OrganismData;
28import org.intermine.bio.util.OrganismRepository;
29import org.intermine.dataconversion.ItemWriter;
30import org.intermine.metadata.Model;
31import org.intermine.sql.Database;
32import org.intermine.util.StringUtil;
33
34/**
35 * DataConverter to read from a Chado database into items
36 * @author Kim Rutherford
37 */
38public class ChadoDBConverter extends BioDBConverter
39{
40    protected static final Logger LOG = Logger.getLogger(ChadoDBConverter.class);
41
42    // a Map from chado organism_id to taxonId
43    private final Map<Integer, OrganismData> chadoToOrgData = new HashMap<Integer, OrganismData>();
44    private String processors = "";
45
46    private final Set<OrganismData> organismsToProcess = new HashSet<OrganismData>();
47
48    private final OrganismRepository organismRepository;
49
50    private final List<ChadoProcessor> completedProcessors = new ArrayList<ChadoProcessor>();
51
52    private Connection connection;
53
54
55    /**
56     * Create a new ChadoDBConverter object.
57     * @param database the database to read from
58     * @param tgtModel the Model used by the object store we will write to with the ItemWriter
59     * @param writer an ItemWriter used to handle the resultant Items
60     * @throws SQLException if we fail to get a database connection
61
62     */
63    public ChadoDBConverter(Database database, Model tgtModel, ItemWriter writer)
64        throws SQLException {
65        super(database, tgtModel, writer, null, null);
66        organismRepository = OrganismRepository.getOrganismRepository();
67        if (getDatabase() == null) {
68            // no Database when testing and no connection needed
69            connection = null;
70        } else {
71            connection = getDatabase().getConnection();
72        }
73    }
74
75    /**
76     * Set the taxon ids to use when creating the Organism Item for the new features.  Only features
77     * from chado with these organisms will be processed.
78     * @param organisms a space separated list of the organism abbreviations or taxon ids to look
79     * up in the organism table eg. "Dmel Dpse"
80     */
81    public void setOrganisms(String organisms) {
82        String[] bits = StringUtil.split(organisms, " ");
83        //for (int i = 0; i < bits.length; i++) {
84        for (String organismIdString: bits) {
85            OrganismData od = null;
86            try {
87                Integer taxonId = Integer.valueOf(organismIdString);
88                od = organismRepository.getOrganismDataByTaxon(taxonId);
89            } catch (NumberFormatException e) {
90                od = organismRepository.getOrganismDataByAbbreviation(organismIdString);
91            }
92            if (od == null) {
93                throw new RuntimeException("can't find organism for: " + organismIdString);
94            }
95            organismsToProcess.add(od);
96        }
97    }
98
99    /**
100     * Set the class names of the ChadoProcessors to run.
101     * @param processors a space separated list of the fully-qualified class names of module
102     * processors to run
103     */
104    public void setProcessors(String processors) {
105        this.processors = processors;
106    }
107
108    /**
109     * Return a map from chado organism_id to OrganismData object for all the organisms that we
110     * are processing
111     * @return the Map
112     */
113    public Map<Integer, OrganismData> getChadoIdToOrgDataMap() {
114        return chadoToOrgData;
115    }
116
117    /**
118     * Get the connection to use when processing.
119     * @return the Connection, or null while testing
120     */
121    protected Connection getConnection() {
122        return connection;
123    }
124
125    /**
126     * Process the data from the Database and write to the ItemWriter.
127     * {@inheritDoc}
128     */
129    @Override
130    public void process() throws Exception {
131
132        if (StringUtils.isEmpty(processors)) {
133            throw new IllegalArgumentException("processors not set in ChadoDBConverter");
134        }
135
136        Map<OrganismData, Integer> tempChadoOrgMap = getChadoOrganismIds(getConnection());
137
138        for (OrganismData od: organismsToProcess) {
139            Integer chadoId = tempChadoOrgMap.get(od);
140            if (chadoId == null) {
141                throw new RuntimeException("Organism " + od
142                                           + " not found in the chado organism table");
143            }
144            chadoToOrgData.put(chadoId, od);
145        }
146
147        if (chadoToOrgData.size() == 0) {
148            throw new RuntimeException("can't find any known organisms in the organism table");
149        }
150
151        String[] bits = processors.trim().split("[ \\t]+");
152        for (int i = 0; i < bits.length; i++) {
153            String className = bits[i];
154            if (!StringUtils.isEmpty(className)) {
155                Class<?> cls = Class.forName(className);
156                Constructor<?> constructor = cls.getDeclaredConstructor(ChadoDBConverter.class);
157                ChadoProcessor currentProcessor = (ChadoProcessor) constructor.newInstance(this);
158                currentProcessor.process(getConnection());
159                getCompletedProcessors().add(currentProcessor);
160            }
161        }
162    }
163
164    /**
165     * Return a map from chado organism id to OrganismData for the organisms in the organism table
166     * in chado.  This is a protected method so that it can be overriden for testing
167     * @param conn the db connection
168     * @param organismsToProcess2
169     * @return a Map from abbreviation to chado organism_id
170     * @throws SQLException if the is a database problem
171     */
172    protected Map<OrganismData, Integer> getChadoOrganismIds(Connection conn)
173        throws SQLException {
174        String query = "select organism_id, abbreviation, genus, species from organism";
175        LOG.info("executing: " + query);
176        Statement stmt = conn.createStatement();
177        ResultSet res = stmt.executeQuery(query);
178
179        Map<OrganismData, Integer> retMap = new HashMap<OrganismData, Integer>();
180
181        OrganismRepository or = OrganismRepository.getOrganismRepository();
182
183        while (res.next()) {
184            int organismId = res.getInt("organism_id");
185            String abbreviation = res.getString("abbreviation");
186            String genus = res.getString("genus");
187            String species = res.getString("species");
188
189            OrganismData od = null;
190
191            if (genus != null && species != null) {
192                od = or.getOrganismDataByGenusSpecies(genus, species);
193            }
194
195            if (od == null) {
196                if (abbreviation != null) {
197                    od = or.getOrganismDataByAbbreviation(abbreviation);
198                }
199            }
200
201            if (od == null) {
202                LOG.warn("can't find OrganismData for species: " + species
203                         + " genus: " + genus + " abbreviation: " + abbreviation);
204            }
205
206            retMap.put(od, new Integer(organismId));
207        }
208
209        return retMap;
210    }
211
212    /**
213     * Return the OrganismData objects for the organisms listed in the source configuration.
214     * @return the organismsToProcess
215     */
216    public Set<OrganismData> getOrganismsToProcess() {
217        return organismsToProcess;
218    }
219
220    /**
221     * Look at the list of completed processors and return the processor of the given type.  If
222     * there is none or more than one, throw a RuntimeException
223     * @param cls the class
224     * @return the ChadoProcessor
225     */
226    public ChadoProcessor findProcessor(Class<? extends ChadoProcessor> cls) {
227        ChadoProcessor returnProcessor = null;
228
229        for (ChadoProcessor processor: getCompletedProcessors()) {
230            if (cls.isAssignableFrom(processor.getClass())) {
231                if (returnProcessor == null) {
232                    returnProcessor = processor;
233                } else {
234                    throw new RuntimeException("Completed processors list contains two objects of "
235                                               + "type: " + cls.getName());
236                }
237            }
238        }
239
240        if (returnProcessor == null) {
241            throw new RuntimeException("Can't find `" + cls.getName() + "` before `"
242                                       + this.getClass().getName()
243                                       + "` in the list of completed processors - must run "
244                                       + cls.getName() + " first.");
245        }
246        return returnProcessor;
247    }
248    /**
249     * Default implementation that makes a data set title based on the data source name.
250     * {@inheritDoc}
251     */
252    @Override
253    public String getDataSetTitle(int taxonId) {
254        OrganismData od = organismRepository.getOrganismDataByTaxon(new Integer(taxonId));
255        if (od != null) {
256            return getDataSourceName() + " data set for " + od.getGenus() + " " + od.getSpecies();
257        }
258        return getDataSourceName() + " data set";
259    }
260
261    /**
262     * @return the completedProcessors
263     */
264    public List<ChadoProcessor> getCompletedProcessors() {
265        return completedProcessors;
266    }
267}
Note: See TracBrowser for help on using the repository browser.