Skip to content

Commit 6bba4e1

Browse files
author
Sebastian Bittrich
committed
cleanup and updates to ciftools 0.4.1
- isolates consumer and converter functions - omits highly specific performance test
1 parent 1755a4c commit 6bba4e1

File tree

8 files changed

+174
-233
lines changed

8 files changed

+174
-233
lines changed

biojava-integrationtest/src/test/java/org/biojava/nbio/structure/test/io/cif/CifFileConsumerIntegrationTest.java

Lines changed: 20 additions & 176 deletions
Original file line numberDiff line numberDiff line change
@@ -11,32 +11,14 @@
1111
import org.biojava.nbio.structure.io.CifFileReader;
1212
import org.biojava.nbio.structure.io.FileParsingParameters;
1313
import org.biojava.nbio.structure.io.LocalPDBDirectory;
14-
import org.biojava.nbio.structure.io.MMCIFFileReader;
15-
import org.biojava.nbio.structure.io.MMTFFileReader;
1614
import org.biojava.nbio.structure.io.PDBFileParser;
17-
import org.biojava.nbio.structure.io.cif.CifFileConverter;
18-
import org.junit.Ignore;
1915
import org.junit.Test;
20-
import org.rcsb.cif.CifReader;
2116

2217
import java.io.IOException;
2318
import java.io.InputStream;
24-
import java.io.UncheckedIOException;
25-
import java.nio.file.Files;
26-
import java.nio.file.Path;
27-
import java.nio.file.Paths;
28-
import java.util.ArrayList;
29-
import java.util.Collections;
30-
import java.util.Date;
3119
import java.util.List;
32-
import java.util.concurrent.atomic.AtomicInteger;
33-
import java.util.regex.Pattern;
34-
import java.util.zip.GZIPInputStream;
3520

36-
import static org.junit.Assert.assertNotNull;
37-
import static org.junit.Assert.assertEquals;
38-
import static org.junit.Assert.assertTrue;
39-
import static org.junit.Assert.fail;
21+
import static org.junit.Assert.*;
4022

4123
public class CifFileConsumerIntegrationTest {
4224
private static boolean headerOnly;
@@ -70,29 +52,29 @@ public void testLoadHeaderOnlyBinary() throws IOException {
7052

7153
private void doTestLoad() throws IOException {
7254
// test a simple protein
73-
comparePDB2cif("5pti","A");
55+
comparePDB2cif("5pti", "A");
7456

7557
// test a protein with modified residues
76-
comparePDB2cif("1a4w","L");
77-
comparePDB2cif("1a4w","H");
78-
comparePDB2cif("1a4w","I");
58+
comparePDB2cif("1a4w", "L");
59+
comparePDB2cif("1a4w", "H");
60+
comparePDB2cif("1a4w", "I");
7961

8062
//non-standard encoded amino acid
81-
comparePDB2cif("1fdo","A");
63+
comparePDB2cif("1fdo", "A");
8264

8365
// test a DNA binding protein
84-
comparePDB2cif("1j59","A");
85-
comparePDB2cif("1j59","E");
66+
comparePDB2cif("1j59", "A");
67+
comparePDB2cif("1j59", "E");
8668

8769
// test a NMR protein
88-
comparePDB2cif("2kc9","A");
70+
comparePDB2cif("2kc9", "A");
8971
}
9072

9173
private void comparePDB2cif(String id, String chainId) throws IOException {
9274
String fileName = binary ? "/" + id + ".bcif" : "/" + id + ".cif";
9375
System.out.println(fileName);
9476
InputStream inStream = getClass().getResourceAsStream(fileName);
95-
assertNotNull("Could not find file " + fileName + ". Config problem?" , inStream);
77+
assertNotNull("Could not find file " + fileName + ". Config problem?", inStream);
9678

9779
LocalPDBDirectory reader = binary ? new BcifFileReader() : new CifFileReader();
9880

@@ -120,7 +102,7 @@ private void comparePDB2cif(String id, String chainId) throws IOException {
120102
pdbStructure.isNmr(),
121103
cifStructure.isNmr());
122104

123-
if (pdbStructure.isNmr()){
105+
if (pdbStructure.isNmr()) {
124106
assertEquals(id + ": the nr of NMR models is not the same!",
125107
pdbStructure.nrModels(),
126108
pdbStructure.nrModels());
@@ -143,7 +125,7 @@ private void comparePDB2cif(String id, String chainId) throws IOException {
143125
a_cif.getAtomGroups(GroupType.AMINOACID).size());
144126

145127
// actually this check not necessarily works, since there can be waters in PDB that we don;t deal with yet in cif...
146-
for (int i = 0 ; i < a_pdb.getAtomGroups(GroupType.AMINOACID).size(); i++){
128+
for (int i = 0; i < a_pdb.getAtomGroups(GroupType.AMINOACID).size(); i++) {
147129
Group gp = a_pdb.getAtomGroups(GroupType.AMINOACID).get(i);
148130
List<Group> cifGroups = a_cif.getAtomGroups(GroupType.AMINOACID);
149131
Group gc = cifGroups.get(i);
@@ -155,8 +137,8 @@ private void comparePDB2cif(String id, String chainId) throws IOException {
155137

156138
assertEquals("the sequences obtained from PDB and mmCif don't match!", pdb_seq, cif_seq);
157139

158-
List<DBRef> pdb_dbrefs= pdbStructure.getDBRefs();
159-
List<DBRef> cif_dbrefs= cifStructure.getDBRefs();
140+
List<DBRef> pdb_dbrefs = pdbStructure.getDBRefs();
141+
List<DBRef> cif_dbrefs = cifStructure.getDBRefs();
160142

161143
assertEquals("nr of DBrefs found does not match!", pdb_dbrefs.size(), cif_dbrefs.size());
162144

@@ -180,7 +162,7 @@ private void comparePDB2cif(String id, String chainId) throws IOException {
180162
h2.toPDB().toUpperCase());
181163
}
182164

183-
private void checkGroups(Group g1, Group g2){
165+
private void checkGroups(Group g1, Group g2) {
184166
String pdbId1 = g1.getChain().getStructure().getPDBCode();
185167
String pdbId2 = g1.getChain().getStructure().getPDBCode();
186168
assertEquals(pdbId1, pdbId2);
@@ -192,8 +174,8 @@ private void checkGroups(Group g1, Group g2){
192174
assertEquals(g1.has3D(), g2.has3D());
193175

194176
assertEquals(g1.hasAltLoc(), g2.hasAltLoc());
195-
assertEquals(pdbId1 + ":" + g1 + " - " + pdbId2 + ":"+ g2, g1.getAltLocs().size(), g2.getAltLocs().size());
196-
assertEquals(pdbId1 + ":" + g1 + " - " + pdbId2 + ":"+ g2, g1.getAtoms().size(), g2.getAtoms().size());
177+
assertEquals(pdbId1 + ":" + g1 + " - " + pdbId2 + ":" + g2, g1.getAltLocs().size(), g2.getAltLocs().size());
178+
assertEquals(pdbId1 + ":" + g1 + " - " + pdbId2 + ":" + g2, g1.getAtoms().size(), g2.getAtoms().size());
197179

198180
if (g1.has3D()) {
199181
Atom a1 = g1.getAtom(0);
@@ -204,7 +186,7 @@ private void checkGroups(Group g1, Group g2){
204186
if (a2 == null) {
205187
fail("could not get atom for group " + g2);
206188
}
207-
assertEquals(a1.getX(),a2.getX(), 0.0001);
189+
assertEquals(a1.getX(), a2.getX(), 0.0001);
208190
assertEquals(a1.getOccupancy(), a2.getOccupancy(), 0.0001);
209191
assertEquals(a1.getTempFactor(), a2.getTempFactor(), 0.0001);
210192
assertEquals(a1.getName(), a2.getName());
@@ -218,12 +200,12 @@ private void checkNMR(Structure s) {
218200
List<Chain> model0 = s.getModel(0);
219201

220202
// compare with all others
221-
for (int i = 1 ; i < models; i++){
203+
for (int i = 1; i < models; i++) {
222204
List<Chain> modelX = s.getModel(i);
223205
assertEquals(model0.size(), modelX.size());
224206

225207
// compare lengths:
226-
for (int j = 0 ; j < model0.size(); j++){
208+
for (int j = 0; j < model0.size(); j++) {
227209
Chain c1 = model0.get(j);
228210
Chain cx = modelX.get(j);
229211
assertEquals(c1.getAtomLength(), cx.getAtomLength());
@@ -234,142 +216,4 @@ private void checkNMR(Structure s) {
234216
}
235217
}
236218
}
237-
238-
/**
239-
* There were issues when parsing files in parallel using SimpleDateFormat. The culprit are not the actual files,
240-
* but rather the parallel execution. No problems spotted by this test, run for whole archive is now flawless after
241-
* dropping SimpleDateFormat for date parsing.
242-
*/
243-
@Test
244-
@Ignore("ignored for now as Bcif file source may change - currently using local files")
245-
public void testFailingEntries() {
246-
Pattern.compile(", ").splitAsStream("4he8, 1z4u, 4fp1, 1blc, 4cit, 2y2y, 4exq, 2n0f, 2d9o, 2v16, 1kqv, " +
247-
"1bwo, 2k2g, 1qhd, 5mhj, 2dn3, 5pq8, 5cay, 6ms1, 2vhu, 2gi0, 3swe, 3daz, 5yel, 2pxp, 4uis, 3cs1, 3in5," +
248-
" 1sl3, 4hjc, 3hj2, 5kpi, 1gyq, 1yq8, 4yqz, 1ox3, 2pls, 1vne, 4q02, 1dtt, 1jau, 5h3b, 5sxk, 4el7, 5q7w," +
249-
" 4zuz, 1n6i, 1dhg, 3dhe, 2gpo, 5if7, 5ld8, 1jhz, 4fr3, 1r6u, 3hdl, 5fse, 1iho, 1t10, 2oc6, 3czx, 3b3o," +
250-
" 5i6w, 2ecv, 4l2x, 441d, 2i0x, 1xq4, 3tbb, 4mmz, 1qew, 6i16, 1t8d, 5w7r, 6gm1, 1s7u, 2qp3, 1cf3, 4myb," +
251-
" 1omh, 1zog, 2b68, 1nqb, 1t7k")
252-
.parallel()
253-
.forEach(pdbId -> {
254-
System.out.println(pdbId);
255-
Structure cif = loadLocalCif(pdbId);
256-
assertNumberFormat(cif);
257-
Structure bcif = loadLocalBcif(pdbId);
258-
assertNumberFormat(bcif);
259-
});
260-
}
261-
262-
private Structure loadLocalCif(String pdbId) {
263-
try {
264-
String middle = pdbId.substring(1, 3);
265-
return CifFileConverter.convert(CifReader.readText(new GZIPInputStream(Files.newInputStream(Paths.get("/var/pdb/" + middle + "/" + pdbId + ".cif.gz")))));
266-
} catch (IOException e) {
267-
throw new UncheckedIOException(e);
268-
}
269-
}
270-
271-
private Structure loadLocalBcif(String pdbId) {
272-
try {
273-
String middle = pdbId.substring(1, 3);
274-
return CifFileConverter.convert(CifReader.readBinary(Files.newInputStream(Paths.get("/var/bcif/" + middle + "/" + pdbId + ".bcif"))));
275-
} catch (IOException e) {
276-
throw new UncheckedIOException(e);
277-
}
278-
}
279-
280-
private void assertNumberFormat(Structure structure) {
281-
PDBHeader header = structure.getPDBHeader();
282-
assertNotNull(header);
283-
Date relDate = header.getRelDate();
284-
assertNotNull(relDate);
285-
Date modDate = header.getModDate();
286-
assertNotNull(modDate);
287-
}
288-
289-
enum Source {
290-
BCIF, // binary cif using ciftools
291-
CIF, // mmCIF using ciftools
292-
MMCIF, // mmCIF using BioJava impl
293-
MMTF // mmtf using BioJava impl
294-
}
295-
296-
/**
297-
* Performance diary;
298-
*
299-
* 05/01/19 - ciftools v0.3.0, parallel, bcif, non-gzipped, 12 worker threads
300-
* BCIF: 918 s for 151079 structures, 6073 µs per structure, failed for 0 entries
301-
* CIF: 2502 s for 151079 structures, 16562 µs per structure, failed for 0 entries
302-
* MMTF: 207 s for 151579 structures, 1367 µs per structure, failed for 0 entries
303-
*
304-
* 05/02/19 - ciftools v0.3.0, parallel, bcif, non-gzipped for bcif / gzipped for cif, 12 worker threads
305-
* BCIF: 35 s for 5000 structures, 6228 µs per structure, failed for 0 entries
306-
* CIF: 132 s for 5000 structures, 23902 µs per structure, failed for 0 entries
307-
* MMCIF: 310 s for 5000 structures, 52329 µs per structure, failed for 0 entries
308-
* MMTF: 8 s for 5000 structures, 1729 µs per structure, failed for 0 entries
309-
*
310-
* CIF parsing using David's tokenizer approach takes roughly half the time. Binary parsing using ciftools takes
311-
* roughly double the time compared to MMTF. However, MMTF provides almost no metadata.
312-
*
313-
* @throws IOException propagated
314-
*/
315-
@Test
316-
@Ignore("ignore long-running test, do run to track performance")
317-
public void parseEntireArchive() throws IOException {
318-
// TODO create an objective test case - all/none gzipped, mmtf parses almost no annotation data
319-
AtomicInteger counter = new AtomicInteger(0);
320-
long start = System.nanoTime();
321-
int chunkSize = 250;
322-
List<String> failed = Collections.synchronizedList(new ArrayList<>());
323-
324-
Source source = Source.MMTF;
325-
Path archivePath = null;
326-
switch (source) {
327-
// change to your own paths
328-
case BCIF: archivePath = Paths.get("/var/bcif/"); break;
329-
case CIF: case MMCIF: archivePath = Paths.get("/var/pdb/"); break;
330-
case MMTF: archivePath = Paths.get("/var/mmtf/"); break;
331-
}
332-
333-
Files.walk(archivePath)
334-
.parallel()
335-
// either process whole archive or limit to some number of structures
336-
// .limit(5000)
337-
.filter(path -> !Files.isDirectory(path))
338-
.forEach(path -> {
339-
int count = counter.incrementAndGet();
340-
if (count % chunkSize == 0) {
341-
long end_chunk = System.nanoTime();
342-
System.out.println("[" + count + "] @ " + (((end_chunk - start) /
343-
1_000 / count) + " µs per structure"));
344-
}
345-
346-
try {
347-
// the work is to obtain the CifFile instance and convert into a BioJava structure
348-
switch (source) {
349-
case BCIF:
350-
CifFileConverter.convert(CifReader.readBinary(Files.newInputStream(path))); break;
351-
case CIF:
352-
CifFileConverter.convert(CifReader.readText(new GZIPInputStream(Files.newInputStream(path)))); break;
353-
case MMCIF:
354-
new MMCIFFileReader().getStructure(new GZIPInputStream(Files.newInputStream(path))); break;
355-
case MMTF:
356-
GZIPInputStream inputStream = new GZIPInputStream(Files.newInputStream(path));
357-
new MMTFFileReader().getStructure(inputStream);
358-
// impl does not close stream
359-
inputStream.close();
360-
break;
361-
}
362-
} catch (Exception e) {
363-
System.err.println("failed for " + path.toFile().getAbsolutePath());
364-
e.printStackTrace();
365-
failed.add(path.toFile().getName().split("\\.")[0]);
366-
}
367-
});
368-
369-
long end = System.nanoTime();
370-
System.out.println((end - start) / 1_000_000_000 + " s");
371-
System.out.println("processed " + counter.get() + " files");
372-
System.out.println("failed for " + failed.size() + " structures");
373-
System.out.println("failed ids: " + failed);
374-
}
375219
}

biojava-integrationtest/src/test/java/org/biojava/nbio/structure/test/io/cif/CifFileSupplierIntegrationTest.java

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,11 @@
1414
import org.biojava.nbio.structure.StructureTools;
1515
import org.biojava.nbio.structure.io.cif.CifFileConverter;
1616
import org.junit.Test;
17-
import org.rcsb.cif.CifReader;
18-
import org.rcsb.cif.CifWriter;
1917

20-
import java.io.File;
21-
import java.io.FileWriter;
18+
import java.io.ByteArrayInputStream;
2219
import java.io.IOException;
20+
import java.io.InputStream;
2321
import java.net.URL;
24-
import java.nio.file.Files;
2522
import java.util.Arrays;
2623

2724
import static org.junit.Assert.assertEquals;
@@ -51,18 +48,11 @@ public void test1A2C() throws IOException {
5148
}
5249

5350
private static void testRoundTrip(String pdbId) throws IOException {
54-
Structure originalStruct = CifFileConverter.convert(CifReader.readText(new URL("https://files.rcsb.org/download/" + pdbId
55-
+ ".cif").openStream()));
51+
URL url = new URL("https://files.rcsb.org/download/" + pdbId + ".cif");
52+
Structure originalStruct = CifFileConverter.fromURL(url);
5653

57-
File outputFile = File.createTempFile("biojava_testing_", ".cif");
58-
outputFile.deleteOnExit();
59-
60-
FileWriter fw = new FileWriter(outputFile);
61-
String cif = CifWriter.composeText(CifFileConverter.convert(originalStruct));
62-
fw.write(cif);
63-
fw.close();
64-
65-
Structure readStruct = CifFileConverter.convert(CifReader.readText(Files.newInputStream(outputFile.toPath())));
54+
InputStream inputStream = new ByteArrayInputStream(CifFileConverter.toText(originalStruct).getBytes());
55+
Structure readStruct = CifFileConverter.fromInputStream(inputStream);
6656

6757
assertNotNull(readStruct);
6858
assertEquals(originalStruct.getChains().size(), readStruct.getChains().size());
@@ -121,9 +111,9 @@ private static void testRoundTrip(String pdbId) throws IOException {
121111
* can be written out correctly.
122112
*/
123113
@Test
124-
public void testBiounitWriting() {
114+
public void testBiounitWriting() throws IOException {
125115
Structure s = createDummyStructure();
126-
String mmcif = CifWriter.composeText(CifFileConverter.convert(s));
116+
String mmcif = CifFileConverter.toText(s);
127117
String[] lines = mmcif.split("\n");
128118
long atomLines = Arrays.stream(lines).filter(l -> l.startsWith("ATOM")).count();
129119
assertNotNull(mmcif);

biojava-structure/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
<dependency>
2222
<groupId>org.rcsb</groupId>
2323
<artifactId>ciftools-java</artifactId>
24-
<version>0.3.0</version>
24+
<version>0.4.1</version>
2525
</dependency>
2626
<dependency>
2727
<groupId>org.rcsb</groupId>

biojava-structure/src/main/java/org/biojava/nbio/structure/io/BcifFileReader.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import org.biojava.nbio.structure.Structure;
44
import org.biojava.nbio.structure.align.util.UserConfiguration;
55
import org.biojava.nbio.structure.io.cif.CifFileConverter;
6-
import org.rcsb.cif.CifReader;
76

87
import java.io.IOException;
98
import java.io.InputStream;
@@ -28,7 +27,7 @@ public BcifFileReader() {
2827
}
2928

3029
/**
31-
* Constructs a new B`FileReader, initializing the extensions member variable.
30+
* Constructs a new BcifFileReader, initializing the extensions member variable.
3231
* The path is initialized to the given path, both autoFetch and splitDir are initialized to false.
3332
*/
3433
public BcifFileReader(String path) {
@@ -39,7 +38,7 @@ public BcifFileReader(String path) {
3938

4039
@Override
4140
public Structure getStructure(InputStream inStream) throws IOException {
42-
return CifFileConverter.convert(CifReader.readBinary(inStream), getFileParsingParameters());
41+
return CifFileConverter.fromInputStream(inStream, getFileParsingParameters());
4342
}
4443

4544
@Override

biojava-structure/src/main/java/org/biojava/nbio/structure/io/CifFileReader.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import org.biojava.nbio.structure.Structure;
44
import org.biojava.nbio.structure.align.util.UserConfiguration;
55
import org.biojava.nbio.structure.io.cif.CifFileConverter;
6-
import org.rcsb.cif.CifReader;
76

87
import java.io.IOException;
98
import java.io.InputStream;
@@ -41,7 +40,7 @@ public CifFileReader(String path) {
4140

4241
@Override
4342
public Structure getStructure(InputStream inStream) throws IOException{
44-
return CifFileConverter.convert(CifReader.readText(inStream), getFileParsingParameters());
43+
return CifFileConverter.fromInputStream(inStream, getFileParsingParameters());
4544
}
4645

4746
@Override

0 commit comments

Comments
 (0)