diff --git a/src/main/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFile.java b/src/main/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFile.java index c08c87ff18..342b0cd950 100644 --- a/src/main/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFile.java +++ b/src/main/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFile.java @@ -38,6 +38,7 @@ import java.nio.file.Path; import java.util.Iterator; + /** * @author Daniel Gomez-Sanchez (magicDGS) */ @@ -146,6 +147,55 @@ protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile, } } + /** Do some basic checking to make sure the fasta and the index match. + *

+ * checks that the length of the fasta file is at least as long as the index proclaims + * and that beyond the last position references in the index there is only one line followed by whitespaces + * + * @param fastaFile Path to fasta file + * @param fastaSequenceIndexes index file to check against the fasta file. + * + * @throws IOException in case of io-error when reading fastaFile + */ + public void sanityCheckFastaAgainstIndex(final Path fastaFile, + final FastaSequenceIndex fastaSequenceIndexes) throws IOException { + + final FastaSequenceIndexEntry lastSequenceIndex = fastaSequenceIndexes.getIndexEntry(fastaSequenceIndexes.getLastSequence()); + + final long lastSequenceLength = lastSequenceIndex.getSize(); + final long lastSequenceStart = lastSequenceIndex.getLocation(); + final long lastSequenceEnd = lastSequenceStart + lastSequenceIndex.getOffset(lastSequenceLength); + + final long fastaLength = Files.size(fastaFile); + + if (lastSequenceEnd > fastaLength) { + throw new IllegalArgumentException("The fasta file (%s) is shorter (%d) than its index (%s) claims (%d). Please reindex the fasta.".formatted(fastaFile.toUri().toString(),fastaLength, index.toString(), lastSequenceEnd)); + } + // if fasta file is longer than this, make sure that the remainder is just whitespaces + long posOfInterest = lastSequenceEnd + lastSequenceIndex.getTerminatorLength(); + if (posOfInterest < fastaLength) { + + final ByteBuffer channelBuffer = ByteBuffer.allocate(100); + + while (posOfInterest < fastaLength) { + channelBuffer.clear(); + readFromPosition(channelBuffer, posOfInterest); + for (int i = 0; i < channelBuffer.position(); i++) { + byte b = channelBuffer.get(i); + if (!Character.isWhitespace((char) b)) { + throw new IllegalArgumentException( + ("The fasta file %s is too long (relative to the index). In particular has a non-whitespace " + + "character (%c) at a position (%d) beyond the last base (%d), according to its index (%s)." + + " Please reindex the fasta.") + .formatted(fastaFile.toUri().toString(), (char) b, posOfInterest + i, lastSequenceEnd,index.toString())); + } + } + posOfInterest += channelBuffer.limit(); + } + } + } + + public FastaSequenceIndex getIndex() { return index; } @@ -210,11 +260,12 @@ public ReferenceSequence getSubsequenceAt( String contig, long start, long stop final int bytesPerLine = indexEntry.getBytesPerLine(); final int terminatorLength = bytesPerLine - basesPerLine; - long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine; + long startOffset = indexEntry.getOffset(start); + // Cast to long so the second argument cannot overflow a signed integer. final long minBufferSize = Math.min((long) Defaults.NON_ZERO_BUFFER_SIZE, (long)(length / basesPerLine + 2) * (long)bytesPerLine); if (minBufferSize > Integer.MAX_VALUE) throw new SAMException("Buffer is too large: " + minBufferSize); - + // Allocate a buffer for reading in sequence data. final ByteBuffer channelBuffer = ByteBuffer.allocate((int)minBufferSize); diff --git a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java index 735ab6347f..175c3cb691 100644 --- a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java +++ b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java @@ -36,11 +36,7 @@ import java.io.PrintStream; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.Objects; -import java.util.Scanner; +import java.util.*; import java.util.regex.MatchResult; /** @@ -50,26 +46,31 @@ public class FastaSequenceIndex implements Iterable { /** * Store the entries. Use a LinkedHashMap for consistent iteration in insertion order. */ - private final Map sequenceEntries = new LinkedHashMap(); + private final Map sequenceEntries = new LinkedHashMap<>(); + /** this member contains the name of the last index entry that was created during initialization. + * Afterwards, subsequent operations (such as rename) might change the names and thus this member becomes unreliable. + */ + private final String lastSequence; + /** * Build a sequence index from the specified file. * @param indexFile File to open. - * @throws FileNotFoundException if the index file cannot be found. + * @throws SAMException if the index file cannot be found. */ - public FastaSequenceIndex( File indexFile ) { + public FastaSequenceIndex( File indexFile ) throws SAMException { this(IOUtil.toPath(indexFile)); } /** * Build a sequence index from the specified file. * @param indexFile File to open. - * @throws FileNotFoundException if the index file cannot be found. + * @throws SAMException if the index file cannot be found. */ - public FastaSequenceIndex( Path indexFile ) { + public FastaSequenceIndex( Path indexFile ) throws SAMException { IOUtil.assertFileIsReadable(indexFile); try (InputStream in = Files.newInputStream(indexFile)) { - parseIndexFile(in); + this.lastSequence = parseIndexFile(in); } catch (IOException e) { throw new SAMException("Fasta index file could not be opened: " + indexFile, e); } @@ -80,13 +81,15 @@ public FastaSequenceIndex( Path indexFile ) { * @param in InputStream to read from. */ public FastaSequenceIndex(InputStream in) { - parseIndexFile(in); + lastSequence=parseIndexFile(in); } /** - * Empty, protected constructor for unit testing. + * Empty, protected constructor for unit testing. Use with care, lastSequence will be incorrect. */ - protected FastaSequenceIndex() {} + protected FastaSequenceIndex() { + lastSequence = ""; + } /** * Add a new index entry to the list. Protected for unit testing. @@ -144,10 +147,13 @@ public int hashCode() { /** * Parse the contents of an index file, caching the results internally. * @param in InputStream to parse. + * + * @return the name of the last contig (for the sanity-check) */ - private void parseIndexFile(InputStream in) { + private String parseIndexFile(InputStream in) { try (Scanner scanner = new Scanner(in)) { int sequenceIndex = 0; + String lastContig = null; while( scanner.hasNext() ) { // Tokenize and validate the index line. String result = scanner.findInLine("(.+)\\t+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)"); @@ -170,7 +176,9 @@ private void parseIndexFile(InputStream in) { contig = SAMSequenceRecord.truncateSequenceName(contig); // Build sequence structure add(new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine, sequenceIndex++) ); + lastContig=contig; } + return lastContig; } } @@ -233,4 +241,13 @@ public Iterator iterator() { public int size() { return sequenceEntries.size(); } + + + /** + * @return The name of the last entry that was added when parsing the index file. Only guarranteed to be correct just + * after initialization. Protected for access from AbstractIndexedFastaSequenceFile. + */ + protected String getLastSequence() { + return this.lastSequence; + } } diff --git a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexEntry.java b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexEntry.java index f8bbf4e157..31666798d6 100644 --- a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexEntry.java +++ b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexEntry.java @@ -112,6 +112,26 @@ public int getSequenceIndex() { return sequenceIndex; } + /** Return the offset to pos as determined by the number of bases and bytes per line + * + * @param pos the (1-based) position in the contig that is requested + * @return the offset (0-based) from 'location' where pos is located in the file. + */ + public long getOffset(long pos) { + final int basesPerLine = this.getBasesPerLine(); + final int bytesPerLine = this.getBytesPerLine(); + + return ((pos - 1) / basesPerLine) * bytesPerLine + ((pos - 1) % basesPerLine); + } + + /** get the terminator length from the bytes per line and the bases per line + * + * @return the length of the terminator of each line (1 or 2) + */ + public int getTerminatorLength() { + return bytesPerLine - basesPerLine; + } + /** * For debugging. Emit the contents of each contig line. * @return A string representation of the contig line. diff --git a/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java b/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java index 26a1f352db..bbe7cb7f9b 100644 --- a/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java +++ b/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java @@ -81,6 +81,7 @@ public IndexedFastaSequenceFile(final Path path, final FastaSequenceIndex index) throw new SAMException("Indexed block-compressed FASTA file cannot be handled: " + path); } this.channel = Files.newByteChannel(path); + sanityCheckFastaAgainstIndex(path,index); } catch (IOException e) { throw new SAMException("FASTA file should be readable but is not: " + path, e); } diff --git a/src/test/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFileTest.java b/src/test/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFileTest.java index 2e189d90f4..4d5b5ba6c8 100644 --- a/src/test/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFileTest.java +++ b/src/test/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFileTest.java @@ -60,6 +60,11 @@ public class AbstractIndexedFastaSequenceFileTest extends HtsjdkTest { private static final File SEQUENCE_FILE_BGZ = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.fasta.gz"); private static final File SEQUENCE_FILE_GZI = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.fasta.gz.gzi"); private static final File SEQUENCE_FILE_NODICT = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.nodict.fasta"); + private static final Path HEADER_WITH_WHITESPACE = new File(TEST_DATA_DIR,"header_with_white_space.fasta").toPath(); + private static final Path HEADER_WITH_EXTRA_WHITESPACE = new File(TEST_DATA_DIR,"header_with_extra_white_space.fasta").toPath(); + private static final Path CRLF = new File(TEST_DATA_DIR,"crlf.fasta").toPath(); + private static final Path HEADER_WITH_WHITESPACE_index = AbstractIndexedFastaSequenceFile.findFastaIndex(HEADER_WITH_WHITESPACE); + private static final Path CRLF_index = AbstractIndexedFastaSequenceFile.findFastaIndex(CRLF); private final String firstBasesOfChrM = "GATCACAGGTCTATCACCCT"; private final String extendedBasesOfChrM = "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT" + @@ -68,6 +73,15 @@ public class AbstractIndexedFastaSequenceFileTest extends HtsjdkTest { private final String lastBasesOfChr20 = "ttgtctgatgctcatattgt"; private final int CHR20_LENGTH = 1000000; + @DataProvider(name="mismatched_indexes") + public Object[][] provideMismatchedIndexes() { + return new Object[][]{ + new Object[]{HEADER_WITH_WHITESPACE,CRLF_index}, + new Object[]{CRLF,HEADER_WITH_WHITESPACE_index} + }; + } + + @DataProvider(name="homosapiens") public Object[][] provideSequenceFile() throws FileNotFoundException { return new Object[][] { new Object[] @@ -470,4 +484,24 @@ public void testBlockCompressedIndexedFastaSequenceFileFromNio() throws IOExcept } } + @Test(expectedExceptions = IllegalArgumentException.class, + dataProvider = "mismatched_indexes") + public void testWrongIndex(Path fasta, Path index) { + try (IndexedFastaSequenceFile indexedFastaSequenceFile = new IndexedFastaSequenceFile(fasta, new FastaSequenceIndex(index))) { + + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Test + public void testExtraWhitespace() { + try (IndexedFastaSequenceFile indexedFastaSequenceFile = + new IndexedFastaSequenceFile(HEADER_WITH_EXTRA_WHITESPACE, + new FastaSequenceIndex(HEADER_WITH_WHITESPACE_index))) { + + } catch (IOException e) { + throw new RuntimeException(e); + } + } } diff --git a/src/test/resources/htsjdk/samtools/reference/header_with_extra_white_space.fasta b/src/test/resources/htsjdk/samtools/reference/header_with_extra_white_space.fasta new file mode 100644 index 0000000000..eee6fc1e80 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/reference/header_with_extra_white_space.fasta @@ -0,0 +1,7 @@ +>a test white space +ACTG +>b test whitespace +ACTG + + + \ No newline at end of file