diff --git a/src/main/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFile.java b/src/main/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFile.java
index c08c87ff18..342b0cd950 100644
--- a/src/main/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFile.java
+++ b/src/main/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFile.java
@@ -38,6 +38,7 @@
import java.nio.file.Path;
import java.util.Iterator;
+
/**
* @author Daniel Gomez-Sanchez (magicDGS)
*/
@@ -146,6 +147,55 @@ protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile,
}
}
+ /** Do some basic checking to make sure the fasta and the index match.
+ *
+ * checks that the length of the fasta file is at least as long as the index proclaims
+ * and that beyond the last position references in the index there is only one line followed by whitespaces
+ *
+ * @param fastaFile Path to fasta file
+ * @param fastaSequenceIndexes index file to check against the fasta file.
+ *
+ * @throws IOException in case of io-error when reading fastaFile
+ */
+ public void sanityCheckFastaAgainstIndex(final Path fastaFile,
+ final FastaSequenceIndex fastaSequenceIndexes) throws IOException {
+
+ final FastaSequenceIndexEntry lastSequenceIndex = fastaSequenceIndexes.getIndexEntry(fastaSequenceIndexes.getLastSequence());
+
+ final long lastSequenceLength = lastSequenceIndex.getSize();
+ final long lastSequenceStart = lastSequenceIndex.getLocation();
+ final long lastSequenceEnd = lastSequenceStart + lastSequenceIndex.getOffset(lastSequenceLength);
+
+ final long fastaLength = Files.size(fastaFile);
+
+ if (lastSequenceEnd > fastaLength) {
+ throw new IllegalArgumentException("The fasta file (%s) is shorter (%d) than its index (%s) claims (%d). Please reindex the fasta.".formatted(fastaFile.toUri().toString(),fastaLength, index.toString(), lastSequenceEnd));
+ }
+ // if fasta file is longer than this, make sure that the remainder is just whitespaces
+ long posOfInterest = lastSequenceEnd + lastSequenceIndex.getTerminatorLength();
+ if (posOfInterest < fastaLength) {
+
+ final ByteBuffer channelBuffer = ByteBuffer.allocate(100);
+
+ while (posOfInterest < fastaLength) {
+ channelBuffer.clear();
+ readFromPosition(channelBuffer, posOfInterest);
+ for (int i = 0; i < channelBuffer.position(); i++) {
+ byte b = channelBuffer.get(i);
+ if (!Character.isWhitespace((char) b)) {
+ throw new IllegalArgumentException(
+ ("The fasta file %s is too long (relative to the index). In particular has a non-whitespace " +
+ "character (%c) at a position (%d) beyond the last base (%d), according to its index (%s)." +
+ " Please reindex the fasta.")
+ .formatted(fastaFile.toUri().toString(), (char) b, posOfInterest + i, lastSequenceEnd,index.toString()));
+ }
+ }
+ posOfInterest += channelBuffer.limit();
+ }
+ }
+ }
+
+
public FastaSequenceIndex getIndex() {
return index;
}
@@ -210,11 +260,12 @@ public ReferenceSequence getSubsequenceAt( String contig, long start, long stop
final int bytesPerLine = indexEntry.getBytesPerLine();
final int terminatorLength = bytesPerLine - basesPerLine;
- long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine;
+ long startOffset = indexEntry.getOffset(start);
+
// Cast to long so the second argument cannot overflow a signed integer.
final long minBufferSize = Math.min((long) Defaults.NON_ZERO_BUFFER_SIZE, (long)(length / basesPerLine + 2) * (long)bytesPerLine);
if (minBufferSize > Integer.MAX_VALUE) throw new SAMException("Buffer is too large: " + minBufferSize);
-
+
// Allocate a buffer for reading in sequence data.
final ByteBuffer channelBuffer = ByteBuffer.allocate((int)minBufferSize);
diff --git a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java
index 735ab6347f..175c3cb691 100644
--- a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java
+++ b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java
@@ -36,11 +36,7 @@
import java.io.PrintStream;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
-import java.util.Objects;
-import java.util.Scanner;
+import java.util.*;
import java.util.regex.MatchResult;
/**
@@ -50,26 +46,31 @@ public class FastaSequenceIndex implements Iterable {
/**
* Store the entries. Use a LinkedHashMap for consistent iteration in insertion order.
*/
- private final Map sequenceEntries = new LinkedHashMap();
+ private final Map sequenceEntries = new LinkedHashMap<>();
+ /** this member contains the name of the last index entry that was created during initialization.
+ * Afterwards, subsequent operations (such as rename) might change the names and thus this member becomes unreliable.
+ */
+ private final String lastSequence;
+
/**
* Build a sequence index from the specified file.
* @param indexFile File to open.
- * @throws FileNotFoundException if the index file cannot be found.
+ * @throws SAMException if the index file cannot be found.
*/
- public FastaSequenceIndex( File indexFile ) {
+ public FastaSequenceIndex( File indexFile ) throws SAMException {
this(IOUtil.toPath(indexFile));
}
/**
* Build a sequence index from the specified file.
* @param indexFile File to open.
- * @throws FileNotFoundException if the index file cannot be found.
+ * @throws SAMException if the index file cannot be found.
*/
- public FastaSequenceIndex( Path indexFile ) {
+ public FastaSequenceIndex( Path indexFile ) throws SAMException {
IOUtil.assertFileIsReadable(indexFile);
try (InputStream in = Files.newInputStream(indexFile)) {
- parseIndexFile(in);
+ this.lastSequence = parseIndexFile(in);
} catch (IOException e) {
throw new SAMException("Fasta index file could not be opened: " + indexFile, e);
}
@@ -80,13 +81,15 @@ public FastaSequenceIndex( Path indexFile ) {
* @param in InputStream to read from.
*/
public FastaSequenceIndex(InputStream in) {
- parseIndexFile(in);
+ lastSequence=parseIndexFile(in);
}
/**
- * Empty, protected constructor for unit testing.
+ * Empty, protected constructor for unit testing. Use with care, lastSequence will be incorrect.
*/
- protected FastaSequenceIndex() {}
+ protected FastaSequenceIndex() {
+ lastSequence = "";
+ }
/**
* Add a new index entry to the list. Protected for unit testing.
@@ -144,10 +147,13 @@ public int hashCode() {
/**
* Parse the contents of an index file, caching the results internally.
* @param in InputStream to parse.
+ *
+ * @return the name of the last contig (for the sanity-check)
*/
- private void parseIndexFile(InputStream in) {
+ private String parseIndexFile(InputStream in) {
try (Scanner scanner = new Scanner(in)) {
int sequenceIndex = 0;
+ String lastContig = null;
while( scanner.hasNext() ) {
// Tokenize and validate the index line.
String result = scanner.findInLine("(.+)\\t+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");
@@ -170,7 +176,9 @@ private void parseIndexFile(InputStream in) {
contig = SAMSequenceRecord.truncateSequenceName(contig);
// Build sequence structure
add(new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine, sequenceIndex++) );
+ lastContig=contig;
}
+ return lastContig;
}
}
@@ -233,4 +241,13 @@ public Iterator iterator() {
public int size() {
return sequenceEntries.size();
}
+
+
+ /**
+ * @return The name of the last entry that was added when parsing the index file. Only guarranteed to be correct just
+ * after initialization. Protected for access from AbstractIndexedFastaSequenceFile.
+ */
+ protected String getLastSequence() {
+ return this.lastSequence;
+ }
}
diff --git a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexEntry.java b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexEntry.java
index f8bbf4e157..31666798d6 100644
--- a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexEntry.java
+++ b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexEntry.java
@@ -112,6 +112,26 @@ public int getSequenceIndex() {
return sequenceIndex;
}
+ /** Return the offset to pos as determined by the number of bases and bytes per line
+ *
+ * @param pos the (1-based) position in the contig that is requested
+ * @return the offset (0-based) from 'location' where pos is located in the file.
+ */
+ public long getOffset(long pos) {
+ final int basesPerLine = this.getBasesPerLine();
+ final int bytesPerLine = this.getBytesPerLine();
+
+ return ((pos - 1) / basesPerLine) * bytesPerLine + ((pos - 1) % basesPerLine);
+ }
+
+ /** get the terminator length from the bytes per line and the bases per line
+ *
+ * @return the length of the terminator of each line (1 or 2)
+ */
+ public int getTerminatorLength() {
+ return bytesPerLine - basesPerLine;
+ }
+
/**
* For debugging. Emit the contents of each contig line.
* @return A string representation of the contig line.
diff --git a/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java b/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java
index 26a1f352db..bbe7cb7f9b 100644
--- a/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java
+++ b/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java
@@ -81,6 +81,7 @@ public IndexedFastaSequenceFile(final Path path, final FastaSequenceIndex index)
throw new SAMException("Indexed block-compressed FASTA file cannot be handled: " + path);
}
this.channel = Files.newByteChannel(path);
+ sanityCheckFastaAgainstIndex(path,index);
} catch (IOException e) {
throw new SAMException("FASTA file should be readable but is not: " + path, e);
}
diff --git a/src/test/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFileTest.java b/src/test/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFileTest.java
index 2e189d90f4..4d5b5ba6c8 100644
--- a/src/test/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFileTest.java
+++ b/src/test/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFileTest.java
@@ -60,6 +60,11 @@ public class AbstractIndexedFastaSequenceFileTest extends HtsjdkTest {
private static final File SEQUENCE_FILE_BGZ = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.fasta.gz");
private static final File SEQUENCE_FILE_GZI = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.fasta.gz.gzi");
private static final File SEQUENCE_FILE_NODICT = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.nodict.fasta");
+ private static final Path HEADER_WITH_WHITESPACE = new File(TEST_DATA_DIR,"header_with_white_space.fasta").toPath();
+ private static final Path HEADER_WITH_EXTRA_WHITESPACE = new File(TEST_DATA_DIR,"header_with_extra_white_space.fasta").toPath();
+ private static final Path CRLF = new File(TEST_DATA_DIR,"crlf.fasta").toPath();
+ private static final Path HEADER_WITH_WHITESPACE_index = AbstractIndexedFastaSequenceFile.findFastaIndex(HEADER_WITH_WHITESPACE);
+ private static final Path CRLF_index = AbstractIndexedFastaSequenceFile.findFastaIndex(CRLF);
private final String firstBasesOfChrM = "GATCACAGGTCTATCACCCT";
private final String extendedBasesOfChrM = "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT" +
@@ -68,6 +73,15 @@ public class AbstractIndexedFastaSequenceFileTest extends HtsjdkTest {
private final String lastBasesOfChr20 = "ttgtctgatgctcatattgt";
private final int CHR20_LENGTH = 1000000;
+ @DataProvider(name="mismatched_indexes")
+ public Object[][] provideMismatchedIndexes() {
+ return new Object[][]{
+ new Object[]{HEADER_WITH_WHITESPACE,CRLF_index},
+ new Object[]{CRLF,HEADER_WITH_WHITESPACE_index}
+ };
+ }
+
+
@DataProvider(name="homosapiens")
public Object[][] provideSequenceFile() throws FileNotFoundException {
return new Object[][] { new Object[]
@@ -470,4 +484,24 @@ public void testBlockCompressedIndexedFastaSequenceFileFromNio() throws IOExcept
}
}
+ @Test(expectedExceptions = IllegalArgumentException.class,
+ dataProvider = "mismatched_indexes")
+ public void testWrongIndex(Path fasta, Path index) {
+ try (IndexedFastaSequenceFile indexedFastaSequenceFile = new IndexedFastaSequenceFile(fasta, new FastaSequenceIndex(index))) {
+
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Test
+ public void testExtraWhitespace() {
+ try (IndexedFastaSequenceFile indexedFastaSequenceFile =
+ new IndexedFastaSequenceFile(HEADER_WITH_EXTRA_WHITESPACE,
+ new FastaSequenceIndex(HEADER_WITH_WHITESPACE_index))) {
+
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
}
diff --git a/src/test/resources/htsjdk/samtools/reference/header_with_extra_white_space.fasta b/src/test/resources/htsjdk/samtools/reference/header_with_extra_white_space.fasta
new file mode 100644
index 0000000000..eee6fc1e80
--- /dev/null
+++ b/src/test/resources/htsjdk/samtools/reference/header_with_extra_white_space.fasta
@@ -0,0 +1,7 @@
+>a test white space
+ACTG
+>b test whitespace
+ACTG
+
+
+
\ No newline at end of file