|
1 | | -/* |
2 | | - * The Gemma project |
3 | | - * |
4 | | - * Copyright (c) 2011 University of British Columbia |
5 | | - * |
6 | | - * Licensed under the Apache License, Version 2.0 (the "License"); |
7 | | - * you may not use this file except in compliance with the License. |
8 | | - * You may obtain a copy of the License at |
9 | | - * |
10 | | - * http://www.apache.org/licenses/LICENSE-2.0 |
11 | | - * |
12 | | - * Unless required by applicable law or agreed to in writing, software |
13 | | - * distributed under the License is distributed on an "AS IS" BASIS, |
14 | | - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | - * See the License for the specific language governing permissions and |
16 | | - * limitations under the License. |
17 | | - * |
18 | | - */ |
19 | | - |
20 | | -package ubic.gemma.core.analysis.preprocess.batcheffects; |
21 | | - |
22 | | -import org.apache.commons.logging.Log; |
23 | | -import org.apache.commons.logging.LogFactory; |
24 | | - |
25 | | -import java.io.BufferedReader; |
26 | | -import java.io.IOException; |
27 | | -import java.text.DateFormat; |
28 | | -import java.text.ParseException; |
29 | | -import java.text.SimpleDateFormat; |
30 | | -import java.util.Calendar; |
31 | | -import java.util.Date; |
32 | | -import java.util.Locale; |
33 | | -import java.util.regex.Matcher; |
34 | | -import java.util.regex.Pattern; |
35 | | - |
36 | | -/** |
37 | | - * @author paul |
38 | | - */ |
39 | | -@SuppressWarnings("WeakerAccess") // Possibly accessed via CLI tools |
40 | | -public abstract class BaseScanDateExtractor implements ScanDateExtractor { |
41 | | - |
42 | | - protected static final String GENEPIX_DATETIME_HEADER_REGEXP = "\"?DateTime=.*"; |
43 | | - private static final String STANDARD_FORMAT_REGEX_2 = ".+?([0-9]{2}[/-][0-9]{2}[/-]\\s[0-9]\\s[0-9]{2}:[0-9]{2}:[0-9]{2}).+"; |
44 | | - private static final String STANDARD_FORMAT_REGEX = ".+?([0-9]{2}[/-][0-9]{2}[/-][0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2}).+"; |
45 | | - private static final String LONG_FORMAT_REGEX = "\\s*Date\\s*(.+)"; |
46 | | - private static final Log log = LogFactory.getLog( BaseScanDateExtractor.class ); |
47 | | - |
48 | | - /** |
49 | | - * This method should be generic for GenePix/GPR/ATR file formats. Has DateType at the top formatted with quotes: |
50 | | - * "DateTime=2005/11/09 11:36:27". Example GSE15739 |
51 | | - * For more information see <a href='http://mdc.custhelp.com/app/answers/detail/a_id/18886'>here</a>. |
52 | | - * |
53 | | - * @param reader the reader |
54 | | - * @return date |
55 | | - * @throws IOException when there was a read error |
56 | | - */ |
57 | | - protected Date extractGenePix( BufferedReader reader ) throws IOException { |
58 | | - String line; |
59 | | - // GPR/ATF file. Read a few lines to find the datetime (the header tells us how long the header is, but |
60 | | - // this is probably okay) |
61 | | - Date d = null; |
62 | | - while ( ( line = reader.readLine() ) != null ) { |
63 | | - |
64 | | - if ( line.matches( BaseScanDateExtractor.GENEPIX_DATETIME_HEADER_REGEXP ) ) { |
65 | | - d = this.parseGenePixDateTime( line ); |
66 | | - break; |
67 | | - } |
68 | | - } |
69 | | - |
70 | | - if ( d == null ) { |
71 | | - throw new IllegalStateException( "Failed to find the 'DateTime' line" ); |
72 | | - } |
73 | | - reader.close(); |
74 | | - return d; |
75 | | - } |
76 | | - |
77 | | - /** |
78 | | - * @param line like "DateTime=2005/11/09 11:36:27" (with the quotes) possibly with trailing whitespace. |
79 | | - * @return date |
80 | | - */ |
81 | | - protected Date parseGenePixDateTime( String line ) { |
82 | | - String dateString = line.trim().replaceAll( "\"", "" ).replaceFirst( "DateTime=", "" ); |
83 | | - try { |
84 | | - |
85 | | - DateFormat f = new SimpleDateFormat( "yyyy/MM/dd HH:mm:ss", Locale.ENGLISH ); // 2005/11/09 11:36:27, 2006/04/07 14:18:18 |
86 | | - return f.parse( dateString ); |
87 | | - } catch ( ParseException e ) { |
88 | | - try { |
89 | | - /* |
90 | | - * Another format we see in GPR files ... 2008:11:27 10:27:42 |
91 | | - */ |
92 | | - DateFormat f = new SimpleDateFormat( "yyyy:MM:dd HH:mm:ss", Locale.ENGLISH ); // 2005/11/09 11:36:27, 2006/04/07 |
93 | | - // 14:18:18 |
94 | | - return f.parse( dateString ); |
95 | | - } catch ( ParseException e1 ) { |
96 | | - throw new RuntimeException( e1 ); |
97 | | - } |
98 | | - |
99 | | - } |
100 | | - } |
101 | | - |
102 | | - /** |
103 | | - * @param string ISO 8601 date time in WSTRING format based on Universal Time Clock UTC (UTC is also known as GMT, or Greenwich |
104 | | - * Mean Time) E.g. "2005-11-23T13:45:53Z" |
105 | | - * @return date |
106 | | - */ |
107 | | - protected Date parseISO8601( String string ) { |
108 | | - try { |
109 | | - Calendar f; |
110 | | - f = javax.xml.bind.DatatypeConverter.parseDateTime( string ); |
111 | | - return f.getTime(); |
112 | | - } catch ( Exception e ) { |
113 | | - return null; |
114 | | - } |
115 | | - |
116 | | - } |
117 | | - |
118 | | - /** |
119 | | - * @param string E.g. "Mon Jun 17 21:26:34 CST 2002", but line has to have Date at start (possibly white-space padded) Shows up in |
120 | | - * Imagene files. |
121 | | - * @return date |
122 | | - */ |
123 | | - protected Date parseLongFormat( String string ) { |
124 | | - try { |
125 | | - DateFormat f = new SimpleDateFormat( "EEE MMM dd HH:mm:ss zzz yyyy", Locale.ENGLISH ); |
126 | | - |
127 | | - Pattern regex = Pattern.compile( BaseScanDateExtractor.LONG_FORMAT_REGEX ); |
128 | | - |
129 | | - Matcher matcher = regex.matcher( string ); |
130 | | - if ( matcher.matches() ) { |
131 | | - String tok = matcher.group( 1 ); |
132 | | - return f.parse( tok ); |
133 | | - } |
134 | | - |
135 | | - return null; |
136 | | - } catch ( ParseException e ) { |
137 | | - throw new RuntimeException( e ); |
138 | | - } |
139 | | - } |
140 | | - |
141 | | - /** |
142 | | - * @param string Parse a common format, "MM[/-]dd[/-]yy hh:mm:ss", found for example in the "DatHeader" line from a CEL file and |
143 | | - * extract the date found there. |
144 | | - * @return date |
145 | | - */ |
146 | | - protected Date parseStandardFormat( String string ) { |
147 | | - |
148 | | - try { |
149 | | - DateFormat f = new SimpleDateFormat( "MM/dd/yy HH:mm:ss", Locale.ENGLISH ); |
150 | | - |
151 | | - Pattern regex = Pattern.compile( BaseScanDateExtractor.STANDARD_FORMAT_REGEX ); |
152 | | - |
153 | | - Matcher matcher = regex.matcher( string ); |
154 | | - if ( matcher.matches() ) { |
155 | | - String tok = matcher.group( 1 ); |
156 | | - return f.parse( tok ); |
157 | | - } |
158 | | - |
159 | | - /* |
160 | | - * For some reason, it is common to get things like "08/26/ 3 12:30:45" - I infer that is supposed to be a |
161 | | - * 03. |
162 | | - */ |
163 | | - Pattern regex2 = Pattern.compile( BaseScanDateExtractor.STANDARD_FORMAT_REGEX_2 ); |
164 | | - matcher = regex2.matcher( string ); |
165 | | - if ( matcher.matches() ) { |
166 | | - String tok = matcher.group( 1 ); |
167 | | - tok = tok.replaceFirst( "\\s", "0" ); |
168 | | - Date d = f.parse( tok ); |
169 | | - BaseScanDateExtractor.log |
170 | | - .warn( "Year was partly missing from date line: " + string + ", inferred " + d ); |
171 | | - return d; |
172 | | - } |
173 | | - |
174 | | - return null; |
175 | | - } catch ( ParseException e ) { |
176 | | - throw new RuntimeException( e ); |
177 | | - } |
178 | | - |
179 | | - } |
180 | | - |
181 | | -} |
| 1 | +/* |
| 2 | + * The Gemma project |
| 3 | + * |
| 4 | + * Copyright (c) 2011 University of British Columbia |
| 5 | + * |
| 6 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 7 | + * you may not use this file except in compliance with the License. |
| 8 | + * You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, software |
| 13 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | + * See the License for the specific language governing permissions and |
| 16 | + * limitations under the License. |
| 17 | + * |
| 18 | + */ |
| 19 | + |
| 20 | +package ubic.gemma.core.analysis.preprocess.batcheffects; |
| 21 | + |
| 22 | +import org.apache.commons.logging.Log; |
| 23 | +import org.apache.commons.logging.LogFactory; |
| 24 | + |
| 25 | +import java.io.BufferedReader; |
| 26 | +import java.io.IOException; |
| 27 | +import java.text.DateFormat; |
| 28 | +import java.text.ParseException; |
| 29 | +import java.text.SimpleDateFormat; |
| 30 | +import java.util.Calendar; |
| 31 | +import java.util.Date; |
| 32 | +import java.util.Locale; |
| 33 | +import java.util.regex.Matcher; |
| 34 | +import java.util.regex.Pattern; |
| 35 | + |
| 36 | +/** |
| 37 | + * @author paul |
| 38 | + */ |
| 39 | +@SuppressWarnings("WeakerAccess") // Possibly accessed via CLI tools |
| 40 | +public abstract class BaseScanDateExtractor implements ScanDateExtractor { |
| 41 | + |
| 42 | + protected static final String GENEPIX_DATETIME_HEADER_REGEXP = "\"?DateTime=.*"; |
| 43 | + private static final String STANDARD_FORMAT_REGEX_2 = ".+?([0-9]{2}[/-][0-9]{2}[/-]\\s[0-9]\\s[0-9]{2}:[0-9]{2}:[0-9]{2}).+"; |
| 44 | + private static final String STANDARD_FORMAT_REGEX = ".+?([0-9]{2}[/-][0-9]{2}[/-][0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2}).+"; |
| 45 | + private static final String LONG_FORMAT_REGEX = "\\s*Date\\s*(.+)"; |
| 46 | + private static final Log log = LogFactory.getLog( BaseScanDateExtractor.class ); |
| 47 | + |
| 48 | + /** |
| 49 | + * This method should be generic for GenePix/GPR/ATR file formats. Has DateType at the top formatted with quotes: |
| 50 | + * "DateTime=2005/11/09 11:36:27". Example GSE15739 |
| 51 | + * For more information see <a href='http://mdc.custhelp.com/app/answers/detail/a_id/18886'>here</a>. |
| 52 | + * |
| 53 | + * @param reader the reader |
| 54 | + * @return date |
| 55 | + * @throws IOException when there was a read error |
| 56 | + */ |
| 57 | + protected Date extractGenePix( BufferedReader reader ) throws IOException { |
| 58 | + String line; |
| 59 | + // GPR/ATF file. Read a few lines to find the datetime (the header tells us how long the header is, but |
| 60 | + // this is probably okay) |
| 61 | + Date d = null; |
| 62 | + while ( ( line = reader.readLine() ) != null ) { |
| 63 | + |
| 64 | + if ( line.matches( BaseScanDateExtractor.GENEPIX_DATETIME_HEADER_REGEXP ) ) { |
| 65 | + d = this.parseGenePixDateTime( line ); |
| 66 | + break; |
| 67 | + } |
| 68 | + } |
| 69 | + |
| 70 | + if ( d == null ) { |
| 71 | + throw new IllegalStateException( "Failed to find the 'DateTime' line" ); |
| 72 | + } |
| 73 | + reader.close(); |
| 74 | + return d; |
| 75 | + } |
| 76 | + |
| 77 | + /** |
| 78 | + * @param line like "DateTime=2005/11/09 11:36:27" (with the quotes) possibly with trailing whitespace. |
| 79 | + * @return date |
| 80 | + */ |
| 81 | + protected Date parseGenePixDateTime( String line ) { |
| 82 | + String dateString = line.trim().replaceAll( "\"", "" ).replaceFirst( "DateTime=", "" ); |
| 83 | + try { |
| 84 | + |
| 85 | + DateFormat f = new SimpleDateFormat( "yyyy/MM/dd HH:mm:ss", Locale.ENGLISH ); // 2005/11/09 11:36:27, 2006/04/07 14:18:18 |
| 86 | + return f.parse( dateString ); |
| 87 | + } catch ( ParseException e ) { |
| 88 | + try { |
| 89 | + /* |
| 90 | + * Another format we see in GPR files ... 2008:11:27 10:27:42 |
| 91 | + */ |
| 92 | + DateFormat f = new SimpleDateFormat( "yyyy:MM:dd HH:mm:ss", Locale.ENGLISH ); // 2005/11/09 11:36:27, 2006/04/07 |
| 93 | + // 14:18:18 |
| 94 | + return f.parse( dateString ); |
| 95 | + } catch ( ParseException e1 ) { |
| 96 | + throw new RuntimeException( e1 ); |
| 97 | + } |
| 98 | + |
| 99 | + } |
| 100 | + } |
| 101 | + |
| 102 | + /** |
| 103 | + * @param string ISO 8601 date time in WSTRING format based on Universal Time Clock UTC (UTC is also known as GMT, or Greenwich |
| 104 | + * Mean Time) E.g. "2005-11-23T13:45:53Z" |
| 105 | + * @return date |
| 106 | + */ |
| 107 | + protected Date parseISO8601( String string ) { |
| 108 | + try { |
| 109 | + Calendar f; |
| 110 | + f = javax.xml.bind.DatatypeConverter.parseDateTime( string ); |
| 111 | + return f.getTime(); |
| 112 | + } catch ( Exception e ) { |
| 113 | + return null; |
| 114 | + } |
| 115 | + |
| 116 | + } |
| 117 | + |
| 118 | + /** |
| 119 | + * @param string E.g. "Mon Jun 17 21:26:34 CST 2002", but line has to have Date at start (possibly white-space padded) Shows up in |
| 120 | + * Imagene files. |
| 121 | + * @return date |
| 122 | + */ |
| 123 | + protected Date parseLongFormat( String string ) { |
| 124 | + try { |
| 125 | + DateFormat f = new SimpleDateFormat( "EEE MMM dd HH:mm:ss zzz yyyy", Locale.ENGLISH ); |
| 126 | + |
| 127 | + Pattern regex = Pattern.compile( BaseScanDateExtractor.LONG_FORMAT_REGEX ); |
| 128 | + |
| 129 | + Matcher matcher = regex.matcher( string ); |
| 130 | + if ( matcher.matches() ) { |
| 131 | + String tok = matcher.group( 1 ); |
| 132 | + return f.parse( tok ); |
| 133 | + } |
| 134 | + |
| 135 | + return null; |
| 136 | + } catch ( ParseException e ) { |
| 137 | + throw new RuntimeException( e ); |
| 138 | + } |
| 139 | + } |
| 140 | + |
| 141 | + /** |
| 142 | + * @param string Parse a common format, "MM[/-]dd[/-]yy hh:mm:ss", found for example in the "DatHeader" line from a CEL file and |
| 143 | + * extract the date found there. |
| 144 | + * @return date |
| 145 | + */ |
| 146 | + protected Date parseStandardFormat( String string ) { |
| 147 | + |
| 148 | + try { |
| 149 | + DateFormat f = new SimpleDateFormat( "MM/dd/yy HH:mm:ss", Locale.ENGLISH ); |
| 150 | + |
| 151 | + Pattern regex = Pattern.compile( BaseScanDateExtractor.STANDARD_FORMAT_REGEX ); |
| 152 | + |
| 153 | + Matcher matcher = regex.matcher( string ); |
| 154 | + if ( matcher.matches() ) { |
| 155 | + String tok = matcher.group( 1 ); |
| 156 | + return f.parse( tok ); |
| 157 | + } |
| 158 | + |
| 159 | + /* |
| 160 | + * For some reason, it is common to get things like "08/26/ 3 12:30:45" - I infer that is supposed to be a |
| 161 | + * 03. |
| 162 | + */ |
| 163 | + Pattern regex2 = Pattern.compile( BaseScanDateExtractor.STANDARD_FORMAT_REGEX_2 ); |
| 164 | + matcher = regex2.matcher( string ); |
| 165 | + if ( matcher.matches() ) { |
| 166 | + String tok = matcher.group( 1 ); |
| 167 | + tok = tok.replaceFirst( "\\s", "0" ); |
| 168 | + Date d = f.parse( tok ); |
| 169 | + BaseScanDateExtractor.log |
| 170 | + .warn( "Year was partly missing from date line: " + string + ", inferred " + d ); |
| 171 | + return d; |
| 172 | + } |
| 173 | + |
| 174 | + return null; |
| 175 | + } catch ( ParseException e ) { |
| 176 | + throw new RuntimeException( e ); |
| 177 | + } |
| 178 | + |
| 179 | + } |
| 180 | + |
| 181 | +} |
0 commit comments