Skip to content

Commit 5a4a68c

Browse files
authored
Merge pull request #774 from sbliven/fix703
Fix #703: Recover from empty structure files in PDB_CACHE_DIR
2 parents b62c2e1 + 08ab3e1 commit 5a4a68c

File tree

12 files changed

+413
-50
lines changed

12 files changed

+413
-50
lines changed

biojava-structure/src/main/java/org/biojava/nbio/structure/StructureIO.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ private static void checkInitAtomCache() {
119119
public static void setAtomCache(AtomCache c){
120120
cache = c;
121121
}
122+
123+
public static AtomCache getAtomCache() {
124+
checkInitAtomCache();
125+
return cache;
126+
}
122127

123128
/**
124129
* Returns the first biologicalAssembly that is available for a protein structure. For more documentation on quaternary structures see:

biojava-structure/src/main/java/org/biojava/nbio/structure/cath/CathInstallation.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,7 @@ private void parseCathDomall(BufferedReader bufferedReader) throws IOException{
636636

637637
protected void downloadFileFromRemote(URL remoteURL, File localFile) throws IOException{
638638
// System.out.println("downloading " + remoteURL + " to: " + localFile);
639+
LOGGER.info("Downloading file {} to local file {}", remoteURL, localFile);
639640

640641
long timeS = System.currentTimeMillis();
641642
File tempFile = File.createTempFile(FileDownloadUtils.getFilePrefix(localFile), "."+ FileDownloadUtils.getFileExtension(localFile));
@@ -665,7 +666,7 @@ protected void downloadFileFromRemote(URL remoteURL, File localFile) throws IOEx
665666
disp = disp / 1024.0;
666667
}
667668
long timeE = System.currentTimeMillis();
668-
LOGGER.info("Downloaded file {} ({}) to local file {} in {} sec.", remoteURL, String.format("%.1f",disp) + unit, localFile, (timeE - timeS)/1000);
669+
LOGGER.info("Downloaded {} in {} sec. to {}", String.format("%.1f",disp) + unit, (timeE - timeS)/1000, localFile);
669670
}
670671

671672
private boolean domainDescriptionFileAvailable(){

biojava-structure/src/main/java/org/biojava/nbio/structure/io/LocalPDBDirectory.java

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import java.io.IOException;
3636
import java.io.InputStream;
3737
import java.net.URL;
38+
import java.nio.file.Files;
3839
import java.text.ParseException;
3940
import java.text.SimpleDateFormat;
4041
import java.util.*;
@@ -127,6 +128,9 @@ public static enum FetchBehavior {
127128

128129
protected static final String lineSplit = System.getProperty("file.separator");
129130

131+
/** Minimum size for a valid structure file (CIF or PDB), in bytes */
132+
public static final long MIN_PDB_FILE_SIZE = 40; // Empty gzip files are 20bytes. Add a few more for buffer.
133+
130134
private File path;
131135
private List<String> extensions;
132136

@@ -402,8 +406,9 @@ public void prefetchStructure(String pdbId) throws IOException {
402406
* Attempts to delete all versions of a structure from the local directory.
403407
* @param pdbId
404408
* @return True if one or more files were deleted
409+
* @throws IOException if the file cannot be deleted
405410
*/
406-
public boolean deleteStructure(String pdbId){
411+
public boolean deleteStructure(String pdbId) throws IOException{
407412
boolean deleted = false;
408413
// Force getLocalFile to check in obsolete locations
409414
ObsoleteBehavior obsolete = getObsoleteBehavior();
@@ -421,7 +426,7 @@ public boolean deleteStructure(String pdbId){
421426
// delete file
422427
boolean success = existing.delete();
423428
if(success) {
424-
logger.info("Deleting "+existing.getAbsolutePath());
429+
logger.debug("Deleting "+existing.getAbsolutePath());
425430
}
426431
deleted = deleted || success;
427432

@@ -430,7 +435,7 @@ public boolean deleteStructure(String pdbId){
430435
if(parent != null) {
431436
success = parent.delete();
432437
if(success) {
433-
logger.info("Deleting "+parent.getAbsolutePath());
438+
logger.debug("Deleting "+parent.getAbsolutePath());
434439
}
435440
}
436441

@@ -660,8 +665,9 @@ protected File getDir(String pdbId, boolean obsolete) {
660665
* Searches for previously downloaded files
661666
* @param pdbId
662667
* @return A file pointing to the existing file, or null if not found
668+
* @throws IOException If the file exists but is empty and can't be deleted
663669
*/
664-
public File getLocalFile(String pdbId) {
670+
public File getLocalFile(String pdbId) throws IOException {
665671

666672
// Search for existing files
667673

@@ -687,6 +693,11 @@ public File getLocalFile(String pdbId) {
687693
for(String ex : getExtensions() ){
688694
File f = new File(searchdir,prefix + pdbId.toLowerCase() + ex) ;
689695
if ( f.exists()) {
696+
// delete files that are too short to have contents
697+
if( f.length() < MIN_PDB_FILE_SIZE ) {
698+
Files.delete(f.toPath());
699+
return null;
700+
}
690701
return f;
691702
}
692703
}
@@ -697,9 +708,11 @@ public File getLocalFile(String pdbId) {
697708
}
698709

699710
protected boolean checkFileExists(String pdbId){
700-
File path = getLocalFile(pdbId);
701-
if ( path != null)
702-
return true;
711+
try {
712+
File path = getLocalFile(pdbId);
713+
if ( path != null)
714+
return true;
715+
} catch(IOException e) {}
703716
return false;
704717
}
705718

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/ChemCompGroupFactory.java

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,8 @@ public static ChemComp getChemComp(String recordName){
6868
* again. Note that this change can have unexpected behavior of
6969
* code executed afterwards.
7070
* <p>
71-
* Changing the provider does not reset the cache, so Chemical
72-
* Component definitions already downloaded from previous providers
73-
* will be used. To reset the cache see {@link #getCache()).
71+
* Changing the provider also resets the cache, so any groups
72+
* previously accessed will be reread or re-downloaded.
7473
*
7574
* @param provider
7675
*/
@@ -84,6 +83,15 @@ public static void setChemCompProvider(ChemCompProvider provider) {
8483
public static ChemCompProvider getChemCompProvider(){
8584
return chemCompProvider;
8685
}
86+
87+
/**
88+
* Force the in-memory cache to be reset.
89+
*
90+
* Note that the ChemCompProvider may have additional memory or disk caches that need to be cleared too.
91+
*/
92+
public static void clearCache() {
93+
cache.clear();
94+
}
8795

8896
public static Group getGroupFromChemCompDictionary(String recordName) {
8997

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/DownloadChemCompProvider.java

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
import org.biojava.nbio.core.util.InputStreamProvider;
4343
import org.biojava.nbio.structure.align.util.HTTPConnectionTools;
4444
import org.biojava.nbio.structure.align.util.UserConfiguration;
45+
import org.biojava.nbio.structure.io.LocalPDBDirectory;
4546
import org.biojava.nbio.structure.io.mmcif.model.ChemComp;
4647
import org.slf4j.Logger;
4748
import org.slf4j.LoggerFactory;
@@ -85,32 +86,37 @@ public class DownloadChemCompProvider implements ChemCompProvider {
8586
protectedIDs.add("AUX");
8687
protectedIDs.add("NUL");
8788
}
89+
90+
private static ChemCompProvider fallback = null; // Fallback provider if the download fails
8891

8992
/** By default only some of the files are downloaded. The user has to explicitly request that all files be downloaded.
9093
*
9194
*/
9295
boolean downloadAll = false;
9396

9497
public DownloadChemCompProvider(){
95-
logger.debug("Initialising DownloadChemCompProvider");
96-
97-
// note that path is static, so this is just to make sure that all non-static methods will have path initialised
98-
initPath();
98+
this(null);
9999
}
100100

101101
public DownloadChemCompProvider(String cacheFilePath){
102102
logger.debug("Initialising DownloadChemCompProvider");
103103

104104
// note that path is static, so this is just to make sure that all non-static methods will have path initialised
105-
path = new File(cacheFilePath);
105+
if(cacheFilePath != null) {
106+
path = new File(cacheFilePath);
107+
}
106108
}
107109

108-
private static void initPath(){
109-
110+
/**
111+
* Get this provider's cache path
112+
* @return the cache directory; if none was set, it is initialised from the UserConfiguration cache file path
113+
*/
114+
public static File getPath(){
110115
if (path==null) {
111116
UserConfiguration config = new UserConfiguration();
112117
path = new File(config.getCacheFilePath());
113118
}
119+
return path;
114120
}
115121

116122
/**
@@ -127,7 +133,7 @@ public void checkDoFirstInstall(){
127133

128134
// this makes sure there is a file separator between every component,
129135
// if path has a trailing file separator or not, it will work for both cases
130-
File dir = new File(path, CHEM_COMP_CACHE_DIRECTORY);
136+
File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
131137
File f = new File(dir, "components.cif.gz");
132138

133139
if ( ! f.exists()) {
@@ -161,7 +167,7 @@ private void split() throws IOException {
161167

162168
logger.info("Installing individual chem comp files ...");
163169

164-
File dir = new File(path, CHEM_COMP_CACHE_DIRECTORY);
170+
File dir = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
165171
File f = new File(dir, "components.cif.gz");
166172

167173

@@ -212,7 +218,7 @@ private void split() throws IOException {
212218
*/
213219
private void writeID(String contents, String currentID) throws IOException{
214220

215-
String localName = DownloadChemCompProvider.getLocalFileName(currentID);
221+
String localName = getLocalFileName(currentID);
216222

217223
try ( PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(localName))) ) {
218224

@@ -272,7 +278,10 @@ public ChemComp getChemComp(String recordName) {
272278

273279
ChemComp chemComp = dict.getChemComp(recordName);
274280

275-
return chemComp;
281+
// May be null if the file was corrupt. Fall back on ReducedChemCompProvider in that case
282+
if(chemComp != null) {
283+
return chemComp;
284+
}
276285

277286
} catch (IOException e) {
278287

@@ -296,9 +305,12 @@ public ChemComp getChemComp(String recordName) {
296305

297306
// see https://github.com/biojava/biojava/issues/315
298307
// probably a network error happened. Try to use the ReducedChemCompProvider
299-
ReducedChemCompProvider reduced = new ReducedChemCompProvider();
308+
if( fallback == null) {
309+
fallback = new ReducedChemCompProvider();
310+
}
300311

301-
return reduced.getChemComp(recordName);
312+
logger.warn("Falling back to ReducedChemCompProvider for {}. This could indicate a network error.", recordName);
313+
return fallback.getChemComp(recordName);
302314

303315
}
304316

@@ -313,16 +325,15 @@ public static String getLocalFileName(String recordName){
313325
recordName = "_" + recordName;
314326
}
315327

316-
initPath();
317-
318-
File f = new File(path, CHEM_COMP_CACHE_DIRECTORY);
328+
File f = new File(getPath(), CHEM_COMP_CACHE_DIRECTORY);
319329
if (! f.exists()){
320330
logger.info("Creating directory " + f);
321331

322332
boolean success = f.mkdir();
323333
// the base directory comes from getPath(), which does not check that it is writable,
324334
// so if the directory cannot be created we at least log an error
325-
if (!success) logger.error("Directory {} could not be created",f);
335+
if (!success)
336+
logger.error("Directory {} could not be created",f);
326337

327338
}
328339

@@ -337,6 +348,14 @@ private static boolean fileExists(String recordName){
337348

338349
File f = new File(fileName);
339350

351+
// delete files that are too short to have contents
352+
if( f.length() < LocalPDBDirectory.MIN_PDB_FILE_SIZE ) {
353+
// Delete defensively.
354+
// Note that if delete is unsuccessful, we re-download the file anyways
355+
f.delete();
356+
return false;
357+
}
358+
340359
return f.exists();
341360

342361
}

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/SimpleMMcifParser.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,9 @@ public void parse(BufferedReader buf)
212212

213213
// the first non-empty, non-comment line should be a data_PDBCODE line; test if this looks like an mmCIF file
214214
line = buf.readLine();
215+
while( line != null && (line.isEmpty() || line.startsWith(COMMENT_CHAR))) {
216+
line = buf.readLine();
217+
}
215218
if (line == null || !line.startsWith(MMCIF_TOP_HEADER)){
216219
logger.error("This does not look like a valid mmCIF file! The first line should start with 'data_', but is: '" + line+"'");
217220
triggerDocumentEnd();

biojava-structure/src/main/java/org/biojava/nbio/structure/io/util/FileDownloadUtils.java

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,6 @@
2121
*/
2222
package org.biojava.nbio.structure.io.util;
2323

24-
import org.slf4j.Logger;
25-
import org.slf4j.LoggerFactory;
26-
2724
import java.io.File;
2825
import java.io.FileInputStream;
2926
import java.io.FileOutputStream;
@@ -36,6 +33,15 @@
3633
import java.nio.channels.Channels;
3734
import java.nio.channels.FileChannel;
3835
import java.nio.channels.ReadableByteChannel;
36+
import java.nio.file.FileVisitResult;
37+
import java.nio.file.Files;
38+
import java.nio.file.Path;
39+
import java.nio.file.Paths;
40+
import java.nio.file.SimpleFileVisitor;
41+
import java.nio.file.attribute.BasicFileAttributes;
42+
43+
import org.slf4j.Logger;
44+
import org.slf4j.LoggerFactory;
3945

4046
public class FileDownloadUtils {
4147

@@ -240,6 +246,41 @@ public static URLConnection prepareURLConnection(String url, int timeout) throws
240246
connection.setConnectTimeout(timeout);
241247
return connection;
242248
}
249+
250+
/**
251+
* Recursively delete a folder and its contents
252+
*
253+
* @param dir directory to delete
254+
*/
255+
public static void deleteDirectory(Path dir) throws IOException {
256+
if(dir == null || !Files.exists(dir))
257+
return;
258+
Files.walkFileTree(dir, new SimpleFileVisitor<Path>() {
259+
@Override
260+
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
261+
Files.delete(file);
262+
return FileVisitResult.CONTINUE;
263+
}
264+
265+
@Override
266+
public FileVisitResult postVisitDirectory(Path dir, IOException e) throws IOException {
267+
if (e != null) {
268+
throw e;
269+
}
270+
Files.delete(dir);
271+
return FileVisitResult.CONTINUE;
272+
}
273+
});
274+
}
275+
/**
276+
* Recursively delete a folder and its contents
277+
*
278+
* @param dir directory to delete
279+
*/
280+
public static void deleteDirectory(String dir) throws IOException {
281+
deleteDirectory(Paths.get(dir));
282+
}
283+
243284

244285
public static void main(String[] args) {
245286
String url;

biojava-structure/src/test/java/org/biojava/nbio/structure/TestAtomCache.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ public class TestAtomCache {
4646
private AtomCache cache;
4747

4848
@Before
49-
public void setUp() {
49+
public void setUp() throws IOException {
5050
cache = new AtomCache();
5151

5252
// Delete files which were cached in previous tests

biojava-structure/src/test/java/org/biojava/nbio/structure/TestExperimentalTechniques.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
public class TestExperimentalTechniques {
3232

3333
@Test
34-
public void test4LNC() throws IOException, StructureException {
34+
public void test6F2Q() throws IOException, StructureException {
3535

3636
// a multiple experimental techniques PDB entry (X-RAY + NEUTRON DIFFRACTION)
3737

@@ -40,9 +40,9 @@ public void test4LNC() throws IOException, StructureException {
4040
StructureIO.setAtomCache(cache);
4141

4242
cache.setUseMmCif(false);
43-
Structure sPdb = StructureIO.getStructure("4LNC");
43+
Structure sPdb = StructureIO.getStructure("6F2Q");
4444
cache.setUseMmCif(true);
45-
Structure sCif = StructureIO.getStructure("4LNC");
45+
Structure sCif = StructureIO.getStructure("6F2Q");
4646

4747
comparePdbToCif(sPdb, sCif);
4848

0 commit comments

Comments
 (0)