From a6723d42a22166de949bed6a6c4f4f1f29a87ee7 Mon Sep 17 00:00:00 2001 From: josemduarte Date: Fri, 28 Feb 2025 11:12:52 -0800 Subject: [PATCH 1/6] Deal with some missing fields in IHM --- .../structure/io/cif/CifStructureConsumerImpl.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java index c610c05f2b..c9ab103037 100644 --- a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java +++ b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java @@ -1297,15 +1297,18 @@ public void finish() { SeqMisMatch seqMisMatch = new SeqMisMatchImpl(); seqMisMatch.setDetails(structRefSeqDif.getDetails().get(rowIndex)); - String insCode = structRefSeqDif.getPdbxPdbInsCode().get(rowIndex); + String insCode = null; + if (structRefSeqDif.getPdbxPdbInsCode().isDefined()) { + insCode = structRefSeqDif.getPdbxPdbInsCode().get(rowIndex); if ("?".equals(insCode)) { - insCode = null; + insCode = null; + } } seqMisMatch.setInsCode(insCode); seqMisMatch.setOrigGroup(structRefSeqDif.getDbMonId().get(rowIndex)); seqMisMatch.setPdbGroup(structRefSeqDif.getMonId().get(rowIndex)); - seqMisMatch.setPdbResNum(structRefSeqDif.getPdbxAuthSeqNum().get(rowIndex)); - seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex)); + seqMisMatch.setPdbResNum(structRefSeqDif.getPdbxAuthSeqNum().isDefined()? structRefSeqDif.getPdbxAuthSeqNum().get(rowIndex):null); + seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().isDefined()? structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex):null); seqMisMatch.setSeqNum(structRefSeqDif.getSeqNum().get(rowIndex)); String strandId = structRefSeqDif.getPdbxPdbStrandId().get(rowIndex); From 8286260589643583c6d631a989b708b4d8c7b956 Mon Sep 17 00:00:00 2001 From: josemduarte Date: Fri, 28 Feb 2025 11:15:03 -0800 Subject: [PATCH 2/6] Dealing with struct_ref_seq_dif strand_id by referencing through struct_ref_seq and struct_ref --- .../io/cif/CifStructureConsumerImpl.java | 81 ++++++++++++++----- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java index c9ab103037..23ab32f3bf 100644 --- a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java +++ b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java @@ -144,6 +144,7 @@ public class CifStructureConsumerImpl implements CifStructureConsumer { private StructNcsOper structNcsOper; private PdbxStructOperList structOpers; private StructRef structRef; + private StructRefSeq structRefSeq; private StructRefSeqDif structRefSeqDif; private StructSiteGen structSiteGen; @@ -689,7 +690,7 @@ public void consumeEntitySrcSyn(PdbxEntitySrcSyn entitySrcSyn) { @Override public void consumeEntityPolySeq(EntityPolySeq entityPolySeq) { for (int rowIndex = 0; rowIndex < entityPolySeq.getRowCount(); rowIndex++) { - Chain entityChain = getEntityChain(entityPolySeq.getEntityId().get(rowIndex)); + Chain entityChain = getEntityChain(entityPolySeq.getEntityId().get(rowIndex), true); // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08 @@ -728,19 +729,27 @@ public void consumeEntityPolySeq(EntityPolySeq entityPolySeq) { } } - private Chain getEntityChain(String entityId) { + /** + * Get a chain from the temporary list holding them. If createNewChains is true, a new chain + * will be added is none are found with the given entityId + * @param entityId the entity id + * @param createNewChains whether to add new chains if not found or not. If false, null will be returned if chain not found + * @return the chain + */ + private Chain getEntityChain(String entityId, boolean createNewChains) { for (Chain chain : entityChains) { if (chain.getId().equals(entityId)) { return chain; } } - - // does not exist yet, so create... - Chain chain = new ChainImpl(); - chain.setId(entityId); - entityChains.add(chain); - - return chain; + if (createNewChains) { + // does not exist yet, so create... + Chain chain = new ChainImpl(); + chain.setId(entityId); + entityChains.add(chain); + return chain; + } + return null; } @Override @@ -967,6 +976,7 @@ public void consumeStructRef(StructRef structRef) { @Override public void consumeStructRefSeq(StructRefSeq structRefSeq) { + this.structRefSeq = structRefSeq; for (int rowIndex = 0; rowIndex < structRefSeq.getRowCount(); rowIndex++) { String refId = structRefSeq.getRefId().get(rowIndex); @@ -1173,7 +1183,7 @@ public void finish() { String entityId = structAsym.getEntityId().get(rowIndex); logger.debug("Entity {} matches asym_id: {}", entityId, id); - Chain chain = getEntityChain(entityId); + Chain chain = getEntityChain(entityId, true); Chain seqRes = (Chain) chain.clone(); // to solve issue #160 (e.g. 3u7t) seqRes = removeSeqResHeterogeneity(seqRes); @@ -1292,7 +1302,8 @@ public void finish() { setStructNcsOps(); setCrystallographicInfoMetadata(); - Map> misMatchMap = new HashMap<>(); + // entity id to list of SeqMisMatch + Map> misMatchMap = new HashMap<>(); for (int rowIndex = 0; rowIndex < structRefSeqDif.getRowCount(); rowIndex++) { SeqMisMatch seqMisMatch = new SeqMisMatchImpl(); seqMisMatch.setDetails(structRefSeqDif.getDetails().get(rowIndex)); @@ -1311,20 +1322,54 @@ public void finish() { seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().isDefined()? structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex):null); seqMisMatch.setSeqNum(structRefSeqDif.getSeqNum().get(rowIndex)); - String strandId = structRefSeqDif.getPdbxPdbStrandId().get(rowIndex); - List seqMisMatches = misMatchMap.computeIfAbsent(strandId, k -> new ArrayList<>()); - seqMisMatches.add(seqMisMatch); + // try to trace the reference entity_id to struct_ref_seq -> struct_ref + String alignId = findRefIdInStructRefSeq(structRefSeqDif.getAlignId().get(rowIndex)); + if (alignId!=null) { + int entityId = findEntityIdInStructRef(alignId); + if (entityId > 0) { + List seqMisMatches = misMatchMap.computeIfAbsent(entityId, k -> new ArrayList<>()); + seqMisMatches.add(seqMisMatch); + } + } } - for (String chainId : misMatchMap.keySet()){ - Chain chain = structure.getPolyChainByPDB(chainId); + for (int entityId : misMatchMap.keySet()){ + Chain chain = getEntityChain(String.valueOf(entityId), false); if (chain == null) { - logger.warn("Could not set mismatches for chain with author id {}", chainId); + logger.warn("Could not set mismatches for chain with entity id {}", entityId); continue; } + chain.setSeqMisMatches(misMatchMap.get(entityId)); + } + } - chain.setSeqMisMatches(misMatchMap.get(chainId)); + private String findRefIdInStructRefSeq(String alignId) { + for (int rowIndex = 0; rowIndex < structRefSeq.getRowCount(); rowIndex++) { + String currentAlignId = structRefSeq.getAlignId().get(rowIndex); + if (alignId.equals(currentAlignId)) { + return structRefSeq.getRefId().isDefined()? structRefSeq.getRefId().get(rowIndex) : null; + } + } + return null; + } + + private int findEntityIdInStructRef(String refId) { + String entityIdStr = null; + for (int rowIndex = 0; rowIndex < structRef.getRowCount(); rowIndex++) { + String currentId = structRef.getId().get(rowIndex); + if (refId.equals(currentId)) { + entityIdStr = structRef.getEntityId().isDefined()? structRef.getEntityId().get(rowIndex) : null; + } + } + int entityId = -1; + if (entityIdStr != null) { + try { + entityId = Integer.parseInt(entityIdStr); + } catch (NumberFormatException e) { + logger.warn("Could not parse entity id from '{}'", entityIdStr); + } } + return entityId; } private String getEntityType(String entityId) { From 58136fe36f4c0c25d3227580e9e3d0b7f869dc7d Mon Sep 17 00:00:00 2001 From: josemduarte Date: Fri, 28 Feb 2025 13:13:47 -0800 Subject: [PATCH 3/6] Logging --- .../biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java index 23ab32f3bf..1de00f7e4c 100644 --- a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java +++ b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java @@ -1171,7 +1171,7 @@ public void finish() { currentModel.add(currentChain); } } else if (!params.isHeaderOnly()) { - logger.warn("current chain is null at end of document."); + logger.warn("No chains were instantiated after parsing the whole CIF document. This could be due to the atom_site category being absent"); } allModels.add(currentModel); From fdf98644c046416531cb53da0f67e8b7fc6a44cf Mon Sep 17 00:00:00 2001 From: josemduarte Date: Fri, 28 Feb 2025 13:22:51 -0800 Subject: [PATCH 4/6] Revert "Dealing with struct_ref_seq_dif strand_id by referencing through struct_ref_seq and struct_ref" This reverts commit 8286260589643583c6d631a989b708b4d8c7b956. --- .../io/cif/CifStructureConsumerImpl.java | 81 +++++-------------- 1 file changed, 18 insertions(+), 63 deletions(-) diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java index 1de00f7e4c..b90b085609 100644 --- a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java +++ b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java @@ -144,7 +144,6 @@ public class CifStructureConsumerImpl implements CifStructureConsumer { private StructNcsOper structNcsOper; private PdbxStructOperList structOpers; private StructRef structRef; - private StructRefSeq structRefSeq; private StructRefSeqDif structRefSeqDif; private StructSiteGen structSiteGen; @@ -690,7 +689,7 @@ public void consumeEntitySrcSyn(PdbxEntitySrcSyn entitySrcSyn) { @Override public void consumeEntityPolySeq(EntityPolySeq entityPolySeq) { for (int rowIndex = 0; rowIndex < entityPolySeq.getRowCount(); rowIndex++) { - Chain entityChain = getEntityChain(entityPolySeq.getEntityId().get(rowIndex), true); + Chain entityChain = getEntityChain(entityPolySeq.getEntityId().get(rowIndex)); // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08 @@ -729,27 +728,19 @@ public void consumeEntityPolySeq(EntityPolySeq entityPolySeq) { } } - /** - * Get a chain from the temporary list holding them. If createNewChains is true, a new chain - * will be added is none are found with the given entityId - * @param entityId the entity id - * @param createNewChains whether to add new chains if not found or not. If false, null will be returned if chain not found - * @return the chain - */ - private Chain getEntityChain(String entityId, boolean createNewChains) { + private Chain getEntityChain(String entityId) { for (Chain chain : entityChains) { if (chain.getId().equals(entityId)) { return chain; } } - if (createNewChains) { - // does not exist yet, so create... - Chain chain = new ChainImpl(); - chain.setId(entityId); - entityChains.add(chain); - return chain; - } - return null; + + // does not exist yet, so create... + Chain chain = new ChainImpl(); + chain.setId(entityId); + entityChains.add(chain); + + return chain; } @Override @@ -976,7 +967,6 @@ public void consumeStructRef(StructRef structRef) { @Override public void consumeStructRefSeq(StructRefSeq structRefSeq) { - this.structRefSeq = structRefSeq; for (int rowIndex = 0; rowIndex < structRefSeq.getRowCount(); rowIndex++) { String refId = structRefSeq.getRefId().get(rowIndex); @@ -1183,7 +1173,7 @@ public void finish() { String entityId = structAsym.getEntityId().get(rowIndex); logger.debug("Entity {} matches asym_id: {}", entityId, id); - Chain chain = getEntityChain(entityId, true); + Chain chain = getEntityChain(entityId); Chain seqRes = (Chain) chain.clone(); // to solve issue #160 (e.g. 3u7t) seqRes = removeSeqResHeterogeneity(seqRes); @@ -1302,8 +1292,7 @@ public void finish() { setStructNcsOps(); setCrystallographicInfoMetadata(); - // entity id to list of SeqMisMatch - Map> misMatchMap = new HashMap<>(); + Map> misMatchMap = new HashMap<>(); for (int rowIndex = 0; rowIndex < structRefSeqDif.getRowCount(); rowIndex++) { SeqMisMatch seqMisMatch = new SeqMisMatchImpl(); seqMisMatch.setDetails(structRefSeqDif.getDetails().get(rowIndex)); @@ -1322,54 +1311,20 @@ public void finish() { seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().isDefined()? structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex):null); seqMisMatch.setSeqNum(structRefSeqDif.getSeqNum().get(rowIndex)); - // try to trace the reference entity_id to struct_ref_seq -> struct_ref - String alignId = findRefIdInStructRefSeq(structRefSeqDif.getAlignId().get(rowIndex)); - if (alignId!=null) { - int entityId = findEntityIdInStructRef(alignId); - if (entityId > 0) { - List seqMisMatches = misMatchMap.computeIfAbsent(entityId, k -> new ArrayList<>()); - seqMisMatches.add(seqMisMatch); - } - } + String strandId = structRefSeqDif.getPdbxPdbStrandId().get(rowIndex); + List seqMisMatches = misMatchMap.computeIfAbsent(strandId, k -> new ArrayList<>()); + seqMisMatches.add(seqMisMatch); } - for (int entityId : misMatchMap.keySet()){ - Chain chain = getEntityChain(String.valueOf(entityId), false); + for (String chainId : misMatchMap.keySet()){ + Chain chain = structure.getPolyChainByPDB(chainId); if (chain == null) { - logger.warn("Could not set mismatches for chain with entity id {}", entityId); + logger.warn("Could not set mismatches for chain with author id {}", chainId); continue; } - chain.setSeqMisMatches(misMatchMap.get(entityId)); - } - } - private String findRefIdInStructRefSeq(String alignId) { - for (int rowIndex = 0; rowIndex < structRefSeq.getRowCount(); rowIndex++) { - String currentAlignId = structRefSeq.getAlignId().get(rowIndex); - if (alignId.equals(currentAlignId)) { - return structRefSeq.getRefId().isDefined()? structRefSeq.getRefId().get(rowIndex) : null; - } - } - return null; - } - - private int findEntityIdInStructRef(String refId) { - String entityIdStr = null; - for (int rowIndex = 0; rowIndex < structRef.getRowCount(); rowIndex++) { - String currentId = structRef.getId().get(rowIndex); - if (refId.equals(currentId)) { - entityIdStr = structRef.getEntityId().isDefined()? structRef.getEntityId().get(rowIndex) : null; - } - } - int entityId = -1; - if (entityIdStr != null) { - try { - entityId = Integer.parseInt(entityIdStr); - } catch (NumberFormatException e) { - logger.warn("Could not parse entity id from '{}'", entityIdStr); - } + chain.setSeqMisMatches(misMatchMap.get(chainId)); } - return entityId; } private String getEntityType(String entityId) { From 2841770e794679818e8eab458b387d157b97f9ec Mon Sep 17 00:00:00 2001 From: josemduarte Date: Fri, 28 Feb 2025 13:24:40 -0800 Subject: [PATCH 5/6] Deal with missing strand_id --- .../nbio/structure/io/cif/CifStructureConsumerImpl.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java index b90b085609..8451d7b8fb 100644 --- a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java +++ b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java @@ -1311,7 +1311,8 @@ public void finish() { seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().isDefined()? structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex):null); seqMisMatch.setSeqNum(structRefSeqDif.getSeqNum().get(rowIndex)); - String strandId = structRefSeqDif.getPdbxPdbStrandId().get(rowIndex); + String strandId = structRefSeqDif.getPdbxPdbStrandId().isDefined()? structRefSeqDif.getPdbxPdbStrandId().get(rowIndex) : null; + if (strandId == null) continue; List seqMisMatches = misMatchMap.computeIfAbsent(strandId, k -> new ArrayList<>()); seqMisMatches.add(seqMisMatch); } From 48cb462e137a17d5161f390bbf265a11bc4f48ee Mon Sep 17 00:00:00 2001 From: Jose Manuel Duarte Date: Tue, 4 Mar 2025 13:52:57 -0800 Subject: [PATCH 6/6] Update biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java Co-authored-by: Sebastian Bittrich --- .../nbio/structure/io/cif/CifStructureConsumerImpl.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java index 8451d7b8fb..67514edd84 100644 --- a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java +++ b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java @@ -1311,8 +1311,8 @@ public void finish() { seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().isDefined()? structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex):null); seqMisMatch.setSeqNum(structRefSeqDif.getSeqNum().get(rowIndex)); - String strandId = structRefSeqDif.getPdbxPdbStrandId().isDefined()? structRefSeqDif.getPdbxPdbStrandId().get(rowIndex) : null; - if (strandId == null) continue; + if (!structRefSeqDif.getPdbxPdbStrandId().isDefined()) continue; + String strandId = structRefSeqDif.getPdbxPdbStrandId().get(rowIndex); List seqMisMatches = misMatchMap.computeIfAbsent(strandId, k -> new ArrayList<>()); seqMisMatches.add(seqMisMatch); }