1111import org .biojava .nbio .structure .io .CifFileReader ;
1212import org .biojava .nbio .structure .io .FileParsingParameters ;
1313import org .biojava .nbio .structure .io .LocalPDBDirectory ;
14- import org .biojava .nbio .structure .io .MMCIFFileReader ;
15- import org .biojava .nbio .structure .io .MMTFFileReader ;
1614import org .biojava .nbio .structure .io .PDBFileParser ;
17- import org .biojava .nbio .structure .io .cif .CifFileConverter ;
18- import org .junit .Ignore ;
1915import org .junit .Test ;
20- import org .rcsb .cif .CifReader ;
2116
2217import java .io .IOException ;
2318import java .io .InputStream ;
24- import java .io .UncheckedIOException ;
25- import java .nio .file .Files ;
26- import java .nio .file .Path ;
27- import java .nio .file .Paths ;
28- import java .util .ArrayList ;
29- import java .util .Collections ;
30- import java .util .Date ;
3119import java .util .List ;
32- import java .util .concurrent .atomic .AtomicInteger ;
33- import java .util .regex .Pattern ;
34- import java .util .zip .GZIPInputStream ;
3520
36- import static org .junit .Assert .assertNotNull ;
37- import static org .junit .Assert .assertEquals ;
38- import static org .junit .Assert .assertTrue ;
39- import static org .junit .Assert .fail ;
21+ import static org .junit .Assert .*;
4022
4123public class CifFileConsumerIntegrationTest {
4224 private static boolean headerOnly ;
@@ -70,29 +52,29 @@ public void testLoadHeaderOnlyBinary() throws IOException {
7052
7153 private void doTestLoad () throws IOException {
7254 // test a simple protein
73- comparePDB2cif ("5pti" ,"A" );
55+ comparePDB2cif ("5pti" , "A" );
7456
7557 // test a protein with modified residues
76- comparePDB2cif ("1a4w" ,"L" );
77- comparePDB2cif ("1a4w" ,"H" );
78- comparePDB2cif ("1a4w" ,"I" );
58+ comparePDB2cif ("1a4w" , "L" );
59+ comparePDB2cif ("1a4w" , "H" );
60+ comparePDB2cif ("1a4w" , "I" );
7961
8062 //non-standard encoded amino acid
81- comparePDB2cif ("1fdo" ,"A" );
63+ comparePDB2cif ("1fdo" , "A" );
8264
8365 // test a DNA binding protein
84- comparePDB2cif ("1j59" ,"A" );
85- comparePDB2cif ("1j59" ,"E" );
66+ comparePDB2cif ("1j59" , "A" );
67+ comparePDB2cif ("1j59" , "E" );
8668
8769 // test a NMR protein
88- comparePDB2cif ("2kc9" ,"A" );
70+ comparePDB2cif ("2kc9" , "A" );
8971 }
9072
9173 private void comparePDB2cif (String id , String chainId ) throws IOException {
9274 String fileName = binary ? "/" + id + ".bcif" : "/" + id + ".cif" ;
9375 System .out .println (fileName );
9476 InputStream inStream = getClass ().getResourceAsStream (fileName );
95- assertNotNull ("Could not find file " + fileName + ". Config problem?" , inStream );
77+ assertNotNull ("Could not find file " + fileName + ". Config problem?" , inStream );
9678
9779 LocalPDBDirectory reader = binary ? new BcifFileReader () : new CifFileReader ();
9880
@@ -120,7 +102,7 @@ private void comparePDB2cif(String id, String chainId) throws IOException {
120102 pdbStructure .isNmr (),
121103 cifStructure .isNmr ());
122104
123- if (pdbStructure .isNmr ()){
105+ if (pdbStructure .isNmr ()) {
124106 assertEquals (id + ": the nr of NMR models is not the same!" ,
125107 pdbStructure .nrModels (),
126108 pdbStructure .nrModels ());
@@ -143,7 +125,7 @@ private void comparePDB2cif(String id, String chainId) throws IOException {
143125 a_cif .getAtomGroups (GroupType .AMINOACID ).size ());
144126
145127 // actually this check not necessarily works, since there can be waters in PDB that we don;t deal with yet in cif...
146- for (int i = 0 ; i < a_pdb .getAtomGroups (GroupType .AMINOACID ).size (); i ++){
128+ for (int i = 0 ; i < a_pdb .getAtomGroups (GroupType .AMINOACID ).size (); i ++) {
147129 Group gp = a_pdb .getAtomGroups (GroupType .AMINOACID ).get (i );
148130 List <Group > cifGroups = a_cif .getAtomGroups (GroupType .AMINOACID );
149131 Group gc = cifGroups .get (i );
@@ -155,8 +137,8 @@ private void comparePDB2cif(String id, String chainId) throws IOException {
155137
156138 assertEquals ("the sequences obtained from PDB and mmCif don't match!" , pdb_seq , cif_seq );
157139
158- List <DBRef > pdb_dbrefs = pdbStructure .getDBRefs ();
159- List <DBRef > cif_dbrefs = cifStructure .getDBRefs ();
140+ List <DBRef > pdb_dbrefs = pdbStructure .getDBRefs ();
141+ List <DBRef > cif_dbrefs = cifStructure .getDBRefs ();
160142
161143 assertEquals ("nr of DBrefs found does not match!" , pdb_dbrefs .size (), cif_dbrefs .size ());
162144
@@ -180,7 +162,7 @@ private void comparePDB2cif(String id, String chainId) throws IOException {
180162 h2 .toPDB ().toUpperCase ());
181163 }
182164
183- private void checkGroups (Group g1 , Group g2 ){
165+ private void checkGroups (Group g1 , Group g2 ) {
184166 String pdbId1 = g1 .getChain ().getStructure ().getPDBCode ();
185167 String pdbId2 = g1 .getChain ().getStructure ().getPDBCode ();
186168 assertEquals (pdbId1 , pdbId2 );
@@ -192,8 +174,8 @@ private void checkGroups(Group g1, Group g2){
192174 assertEquals (g1 .has3D (), g2 .has3D ());
193175
194176 assertEquals (g1 .hasAltLoc (), g2 .hasAltLoc ());
195- assertEquals (pdbId1 + ":" + g1 + " - " + pdbId2 + ":" + g2 , g1 .getAltLocs ().size (), g2 .getAltLocs ().size ());
196- assertEquals (pdbId1 + ":" + g1 + " - " + pdbId2 + ":" + g2 , g1 .getAtoms ().size (), g2 .getAtoms ().size ());
177+ assertEquals (pdbId1 + ":" + g1 + " - " + pdbId2 + ":" + g2 , g1 .getAltLocs ().size (), g2 .getAltLocs ().size ());
178+ assertEquals (pdbId1 + ":" + g1 + " - " + pdbId2 + ":" + g2 , g1 .getAtoms ().size (), g2 .getAtoms ().size ());
197179
198180 if (g1 .has3D ()) {
199181 Atom a1 = g1 .getAtom (0 );
@@ -204,7 +186,7 @@ private void checkGroups(Group g1, Group g2){
204186 if (a2 == null ) {
205187 fail ("could not get atom for group " + g2 );
206188 }
207- assertEquals (a1 .getX (),a2 .getX (), 0.0001 );
189+ assertEquals (a1 .getX (), a2 .getX (), 0.0001 );
208190 assertEquals (a1 .getOccupancy (), a2 .getOccupancy (), 0.0001 );
209191 assertEquals (a1 .getTempFactor (), a2 .getTempFactor (), 0.0001 );
210192 assertEquals (a1 .getName (), a2 .getName ());
@@ -218,12 +200,12 @@ private void checkNMR(Structure s) {
218200 List <Chain > model0 = s .getModel (0 );
219201
220202 // compare with all others
221- for (int i = 1 ; i < models ; i ++){
203+ for (int i = 1 ; i < models ; i ++) {
222204 List <Chain > modelX = s .getModel (i );
223205 assertEquals (model0 .size (), modelX .size ());
224206
225207 // compare lengths:
226- for (int j = 0 ; j < model0 .size (); j ++){
208+ for (int j = 0 ; j < model0 .size (); j ++) {
227209 Chain c1 = model0 .get (j );
228210 Chain cx = modelX .get (j );
229211 assertEquals (c1 .getAtomLength (), cx .getAtomLength ());
@@ -234,142 +216,4 @@ private void checkNMR(Structure s) {
234216 }
235217 }
236218 }
237-
238- /**
239- * There were issues when parsing files in parallel using SimpleDateFormat. The culprit are not the actual files,
240- * but rather the parallel execution. No problems spotted by this test, run for whole archive is now flawless after
241- * dropping SimpleDateFormat for date parsing.
242- */
243- @ Test
244- @ Ignore ("ignored for now as Bcif file source may change - currently using local files" )
245- public void testFailingEntries () {
246- Pattern .compile (", " ).splitAsStream ("4he8, 1z4u, 4fp1, 1blc, 4cit, 2y2y, 4exq, 2n0f, 2d9o, 2v16, 1kqv, " +
247- "1bwo, 2k2g, 1qhd, 5mhj, 2dn3, 5pq8, 5cay, 6ms1, 2vhu, 2gi0, 3swe, 3daz, 5yel, 2pxp, 4uis, 3cs1, 3in5," +
248- " 1sl3, 4hjc, 3hj2, 5kpi, 1gyq, 1yq8, 4yqz, 1ox3, 2pls, 1vne, 4q02, 1dtt, 1jau, 5h3b, 5sxk, 4el7, 5q7w," +
249- " 4zuz, 1n6i, 1dhg, 3dhe, 2gpo, 5if7, 5ld8, 1jhz, 4fr3, 1r6u, 3hdl, 5fse, 1iho, 1t10, 2oc6, 3czx, 3b3o," +
250- " 5i6w, 2ecv, 4l2x, 441d, 2i0x, 1xq4, 3tbb, 4mmz, 1qew, 6i16, 1t8d, 5w7r, 6gm1, 1s7u, 2qp3, 1cf3, 4myb," +
251- " 1omh, 1zog, 2b68, 1nqb, 1t7k" )
252- .parallel ()
253- .forEach (pdbId -> {
254- System .out .println (pdbId );
255- Structure cif = loadLocalCif (pdbId );
256- assertNumberFormat (cif );
257- Structure bcif = loadLocalBcif (pdbId );
258- assertNumberFormat (bcif );
259- });
260- }
261-
262- private Structure loadLocalCif (String pdbId ) {
263- try {
264- String middle = pdbId .substring (1 , 3 );
265- return CifFileConverter .convert (CifReader .readText (new GZIPInputStream (Files .newInputStream (Paths .get ("/var/pdb/" + middle + "/" + pdbId + ".cif.gz" )))));
266- } catch (IOException e ) {
267- throw new UncheckedIOException (e );
268- }
269- }
270-
271- private Structure loadLocalBcif (String pdbId ) {
272- try {
273- String middle = pdbId .substring (1 , 3 );
274- return CifFileConverter .convert (CifReader .readBinary (Files .newInputStream (Paths .get ("/var/bcif/" + middle + "/" + pdbId + ".bcif" ))));
275- } catch (IOException e ) {
276- throw new UncheckedIOException (e );
277- }
278- }
279-
280- private void assertNumberFormat (Structure structure ) {
281- PDBHeader header = structure .getPDBHeader ();
282- assertNotNull (header );
283- Date relDate = header .getRelDate ();
284- assertNotNull (relDate );
285- Date modDate = header .getModDate ();
286- assertNotNull (modDate );
287- }
288-
289- enum Source {
290- BCIF , // binary cif using ciftools
291- CIF , // mmCIF using ciftools
292- MMCIF , // mmCIF using BioJava impl
293- MMTF // mmtf using BioJava impl
294- }
295-
296- /**
297- * Performance diary;
298- *
299- * 05/01/19 - ciftools v0.3.0, parallel, bcif, non-gzipped, 12 worker threads
300- * BCIF: 918 s for 151079 structures, 6073 µs per structure, failed for 0 entries
301- * CIF: 2502 s for 151079 structures, 16562 µs per structure, failed for 0 entries
302- * MMTF: 207 s for 151579 structures, 1367 µs per structure, failed for 0 entries
303- *
304- * 05/02/19 - ciftools v0.3.0, parallel, bcif, non-gzipped for bcif / gzipped for cif, 12 worker threads
305- * BCIF: 35 s for 5000 structures, 6228 µs per structure, failed for 0 entries
306- * CIF: 132 s for 5000 structures, 23902 µs per structure, failed for 0 entries
307- * MMCIF: 310 s for 5000 structures, 52329 µs per structure, failed for 0 entries
308- * MMTF: 8 s for 5000 structures, 1729 µs per structure, failed for 0 entries
309- *
310- * CIF parsing using David's tokenizer approach takes roughly half the time. Binary parsing using ciftools takes
311- * roughly double the time compared to MMTF. However, MMTF provides almost no metadata.
312- *
313- * @throws IOException propagated
314- */
315- @ Test
316- @ Ignore ("ignore long-running test, do run to track performance" )
317- public void parseEntireArchive () throws IOException {
318- // TODO create an objective test case - all/none gzipped, mmtf parses almost no annotation data
319- AtomicInteger counter = new AtomicInteger (0 );
320- long start = System .nanoTime ();
321- int chunkSize = 250 ;
322- List <String > failed = Collections .synchronizedList (new ArrayList <>());
323-
324- Source source = Source .MMTF ;
325- Path archivePath = null ;
326- switch (source ) {
327- // change to your own paths
328- case BCIF : archivePath = Paths .get ("/var/bcif/" ); break ;
329- case CIF : case MMCIF : archivePath = Paths .get ("/var/pdb/" ); break ;
330- case MMTF : archivePath = Paths .get ("/var/mmtf/" ); break ;
331- }
332-
333- Files .walk (archivePath )
334- .parallel ()
335- // either process whole archive or limit to some number of structures
336- // .limit(5000)
337- .filter (path -> !Files .isDirectory (path ))
338- .forEach (path -> {
339- int count = counter .incrementAndGet ();
340- if (count % chunkSize == 0 ) {
341- long end_chunk = System .nanoTime ();
342- System .out .println ("[" + count + "] @ " + (((end_chunk - start ) /
343- 1_000 / count ) + " µs per structure" ));
344- }
345-
346- try {
347- // the work is to obtain the CifFile instance and convert into a BioJava structure
348- switch (source ) {
349- case BCIF :
350- CifFileConverter .convert (CifReader .readBinary (Files .newInputStream (path ))); break ;
351- case CIF :
352- CifFileConverter .convert (CifReader .readText (new GZIPInputStream (Files .newInputStream (path )))); break ;
353- case MMCIF :
354- new MMCIFFileReader ().getStructure (new GZIPInputStream (Files .newInputStream (path ))); break ;
355- case MMTF :
356- GZIPInputStream inputStream = new GZIPInputStream (Files .newInputStream (path ));
357- new MMTFFileReader ().getStructure (inputStream );
358- // impl does not close stream
359- inputStream .close ();
360- break ;
361- }
362- } catch (Exception e ) {
363- System .err .println ("failed for " + path .toFile ().getAbsolutePath ());
364- e .printStackTrace ();
365- failed .add (path .toFile ().getName ().split ("\\ ." )[0 ]);
366- }
367- });
368-
369- long end = System .nanoTime ();
370- System .out .println ((end - start ) / 1_000_000_000 + " s" );
371- System .out .println ("processed " + counter .get () + " files" );
372- System .out .println ("failed for " + failed .size () + " structures" );
373- System .out .println ("failed ids: " + failed );
374- }
375219}
0 commit comments