diff --git a/src/main/org/bson/BSONDecoder.java b/src/main/org/bson/BSONDecoder.java index 78ef6c350e9..4b21d4c369b 100644 --- a/src/main/org/bson/BSONDecoder.java +++ b/src/main/org/bson/BSONDecoder.java @@ -5,8 +5,8 @@ import static org.bson.BSON.*; import java.io.*; +import java.lang.ref.*; -import org.bson.io.*; import org.bson.types.*; public class BSONDecoder { @@ -42,7 +42,7 @@ public int decode( InputStream in , BSONCallback callback ) return decode( new Input( in ) , callback ); } - public int decode( Input in , BSONCallback callback ) + int decode( Input in , BSONCallback callback ) throws IOException { if ( _in != null || _callback != null ) @@ -62,22 +62,21 @@ public int decode( Input in , BSONCallback callback ) int decode() throws IOException { - - final int start = _in._read; + // + // We already read four bytes for length (the Input constructor calls readInt) + final int start = _in.getBytesRead() - 4; - final int len = _in.readInt(); - _callback.objectStart(); while ( decodeElement() ); _callback.objectDone(); - final int read = _in._read - start; + final int read = _in.getBytesRead() - start; - if ( read != len ){ + if ( read != _in._length ) { //throw new IllegalArgumentException( "bad data. lengths don't match " + read + " != " + len ); } - return len; + return _in._length; } boolean decodeElement() @@ -251,32 +250,154 @@ Object _readBasicObject() } class Input { - Input( InputStream in ){ + /** + * Maximum size of readahead. This ensures that we copy in memory at most + * readahead bytes if the buffer does not contain enough continuous bytes. + * Must be lower than or equal to the size of _charBuffer to prevent a buffer overflow. 
+ */ + final private static int MAX_READAHEADSIZE = 512; + + Input( final InputStream in ) + throws IOException { _in = in; _read = 0; + // + // Limit Buffer to only read 4 bytes for the real length + _length = 4; + _length = readInt(); } - - int readInt() - throws IOException { - _read += 4; - return Bits.readInt( _in ); + /** + * Ensures that a continuous block of bytes is loaded to the buffer. It is the caller's responsibility to consume + * the complete block. + * + * @param blockSize number of bytes that must be available in the buffer starting at the current offset + * @throws IOException if reading from the underlying stream fails + */ + void ensureContinuousBlock(int blockSize) + throws IOException { + // + // Enough bytes already loaded? + if(_o + blockSize <= _l) + return; + + final int remaining = _l - _o; + // + // Is buffer large enough for block? + if(blockSize < _random.length) { + // + // copy the rest in the buffer to the front + System.arraycopy(_random, _o, _random, 0, remaining); + } + else { + // + // Allocate a larger buffer + final byte largerBuffer[] = new byte[blockSize + MAX_READAHEADSIZE]; + // + // copy the rest of the old buffer to the front of the new + System.arraycopy(_random, _o, largerBuffer, 0, remaining); + // + // swap the buffers + _random = largerBuffer; + } + // + // Increase the number of bytes read by all processed bytes (offset within current buffer) + // Buffer is now aligned with the front + _read += _o; + + _o = 0; + _l = remaining; + // + // Calculate possible readahead. It is not allowed to read beyond the end of the current object (_length) + final int bytesTillEnd = _length - _read - _l; + final int readahead = Math.min(Math.min(MAX_READAHEADSIZE, _random.length - remaining), bytesTillEnd); + + int wanted = Math.max(readahead, blockSize - remaining); + + while(wanted > 0 && _l < blockSize) { + // + // Read as much as we wanted at the end of the buffer + int rd = _in.read(_random, _l, wanted); + // + // EOS reached? 
+ if(rd < 0) + break; + // + // Increase end and reduce wanted by bytes read from InputStream + _l = _l + rd; + wanted -=rd; + } + // + // Oops, we were not able to read enough bytes from stream + if(_l < blockSize) { + throw new RuntimeException("end of stream reached"); + } } - long readLong() + /** + * Reads an integer. + * + * @return the next four bytes assembled as a little-endian int + * @throws IOException if reading from the underlying stream fails + */ + final int readInt() throws IOException { - _read += 8; - return Bits.readLong( _in ); + // + // All integers are 4 bytes + ensureContinuousBlock(4); + // + // Little-endian assembly: the first byte is the least significant one + return + ((_random[_o++] & 0xFF) << 0) + + ((_random[_o++] & 0xFF) << 8) + + ((_random[_o++] & 0xFF) << 16) + + ((_random[_o++]) << 24); } - + /** + * Reads a long. + * + * @return the next eight bytes assembled as a little-endian long + * @throws IOException if reading from the underlying stream fails + */ + long readLong() + throws IOException { + // + // All longs are 8 bytes + ensureContinuousBlock(8); + // + // Little-endian assembly: the first byte is the least significant one + return ((_random[_o++] & 0xFFL) << 0) + + ((_random[_o++] & 0xFFL) << 8) + + ((_random[_o++] & 0xFFL) << 16) + + ((_random[_o++] & 0xFFL) << 24) + + ((_random[_o++] & 0xFFL) << 32) + + ((_random[_o++] & 0xFFL) << 40) + + ((_random[_o++] & 0xFFL) << 48) + + (((long) _random[_o++]) << 56); + } + /** + * Simply read a double + * + * @return the next eight bytes, read via readLong, converted with Double.longBitsToDouble + * @throws IOException if reading from the underlying stream fails + */ double readDouble() throws IOException { return Double.longBitsToDouble( readLong() ); } - + /** + * Read the next byte from stream. 
+ * + * @return the next byte of the buffer + * @throws IOException if reading from the underlying stream fails + */ byte read() - throws IOException { - _read++; - return (byte)(_in.read() & 0xFF); + throws IOException { + // + // Ensure that one byte can be read + ensureContinuousBlock(1); + // + // Simply return the byte + return _random[_o++]; } void fill( byte b[] ) @@ -285,111 +406,317 @@ void fill( byte b[] ) } void fill( byte b[] , int len ) - throws IOException { - int off = 0; - while ( len > 0 ){ - int x = _in.read( b , off , len ); - _read += x; - off += x; - len -= x; + throws IOException { + // + // Take the remaining bytes from the buffer + int remaining = _l - _o; + // + // Did we already read enough bytes? + if(remaining >= len) { + System.arraycopy(_random, _o, b, 0, len); + _o += len; + + return; + } + // + // Take the complete remaining bytes from buffer + if(remaining > 0) { + System.arraycopy(_random, _o, b, 0, remaining); + // + // Reduce needed bytes + len -= remaining; + // + // The buffer is drained now; the next ensureContinuousBlock call refills it + _o = _l; + } + // + // Read the rest direct from the InputStream + while ( len > 0 ) { + final int bytesRead = _in.read( b , remaining , len ); + // + // EOS reached? Fail here; subtracting -1 would grow len and corrupt the offsets + if ( bytesRead < 0 ) + throw new EOFException( "end of stream reached" ); + // + // Reduce needed bytes + len -= bytesRead; + // + // Increase the number of read bytes because we are reading directly from _in + _read += bytesRead; + + remaining += bytesRead; + } + } + /** + * Read a multibyte character with the first given as parameter c1. 
+ * + * @param c1 first byte of the sequence as an unsigned value (0..255) + * @param charBufferPosition index in _charBuffer where the decoded character is written + * @return the position in _charBuffer after the decoded character(s) + * @throws IOException if reading from the underlying stream fails + */ + int readMultiByte(int c1, int charBufferPosition) + throws IOException { + switch (c1 >> 4) { + case 12: + case 13: { + // + // We need at least one byte for the character and one for the null to terminate + assert charBufferPosition < _charBuffer.length; + ensureContinuousBlock(2); + // + // Read next byte and check for correctness + final int c2 = _random[_o++]; + + if ((c2 & 0xC0) != 0x80) + _charBuffer[charBufferPosition++] = '\uFFFD'; + else + _charBuffer[charBufferPosition++] = (char)(((c1 & 0x1F) << 6) | (c2 & 0x3F)); + + break; + } + case 14: { + // + // We need at least two bytes for the character and one for the null to terminate + assert charBufferPosition < _charBuffer.length; + ensureContinuousBlock(3); + // + // Read next bytes and check for correctness + final int c2 = _random[_o++]; + final int c3 = _random[_o++]; + + if (((c2 & 0xC0) != 0x80) || ((c3 & 0xC0) != 0x80)) + _charBuffer[charBufferPosition++] = '\uFFFD'; + else + _charBuffer[charBufferPosition++] = (char)(((c1 & 0x0F) << 12) | ((c2 & 0x3F) << 6) | ((c3 & 0x3F) << 0)); + + break; + } + case 15: { + // + // We need at least three bytes for the character and one for the null to terminate + ensureContinuousBlock(4); + // + // Read next bytes (no continuation-byte check is done in this branch) + final int c2 = _random[_o++]; + final int c3 = _random[_o++]; + final int c4 = _random[_o++]; + // Use a surrogate pair to represent it. 
+ // ch is 0..fffff (20 bits) + final int ch = ((c1&0x7)<<18) + ((c2&0x3f)<<12) + ((c3&0x3f)<<6) + (c4&0x3f) - 0x10000; + + _charBuffer[charBufferPosition++] = (char) (0xd800 + (ch >> 10)); // top 10 bits + _charBuffer[charBufferPosition++] = (char) (0xdc00 + (ch & 0x3ff)); // bottom 10 bits + + break; + } + default: + _charBuffer[charBufferPosition++] = '\uFFFD'; } + + return charBufferPosition; } - - boolean _isAscii( byte b ){ - return b >=0 && b <= 127; - } + /** + * Read a null terminated string in UTF8 from {@link InputStream}. + * We assume that null terminated strings have small lengths and are mostly ascii. + * + * @return the decoded string without the terminating null; "" for an empty string + * @throws IOException if reading from the underlying stream fails + */ String readCStr() - throws IOException { - - boolean isAcii = true; - - // short circuit 1 byte strings - { - _random[0] = read(); - if ( _random[0] == 0 ) - return ""; - - _random[1] = read(); - if ( _random[1] == 0 ){ - String out = ONE_BYTE_STRINGS[_random[0]]; - if ( out != null ) - return out; - return new String( _random , 0 , 1 , "UTF-8" ); - } - - _stringBuffer.reset(); - _stringBuffer.write( _random[0] ); - _stringBuffer.write( _random[1] ); + throws IOException { + // + // Position within _charBuffer + int charBufferPosition = 0; + // + // Claim a StringBuilder for building strings longer than charBuffer + StringBuilder stringBuilder = _stringBuilder.get(); + + if(stringBuilder == null) { + stringBuilder = new StringBuilder(_charBuffer.length * 2); + _stringBuilder = new SoftReference<StringBuilder>(stringBuilder); + } + else + stringBuilder.setLength(0); + // + // Fill the buffer with the first byte + ensureContinuousBlock(1); + + outer: + while ( true ) { + // + // This is the fast inner loop where every character is completely located in the buffer + // Since we read at maximum MAX_READAHEADSIZE and _charBuffer.length is greater than MAX_READAHEADSIZE + // there is no need to check for a buffer overflow in _charBuffer. 
+ assert(_l - _o <= _charBuffer.length - charBufferPosition); + + while(_o < _l) { + // + // Read next byte from buffer + final int b = _random[_o++]; + // + // Normal ascii character? It is the most common case + if( b > 0) { + // + // Append it to the end of our buffer + assert charBufferPosition < _charBuffer.length; + _charBuffer[charBufferPosition++] = (char)b; + } + else if( b == 0) { + break outer; + } + else { + // + // A 4-byte sequence decodes to a surrogate pair, so keep two free slots in _charBuffer + if(charBufferPosition > _charBuffer.length - 2) { + stringBuilder.append(_charBuffer, 0, charBufferPosition); + charBufferPosition = 0; + } + // + // Read a multibyte. It is currently not optimized because this case is infrequent + charBufferPosition = readMultiByte(b & 0xff, charBufferPosition); + // + // readMultiByte may have refilled the byte buffer; flush so the loaded bytes still fit into _charBuffer + if(_l - _o > _charBuffer.length - charBufferPosition) { + stringBuilder.append(_charBuffer, 0, charBufferPosition); + charBufferPosition = 0; + } + } + } + // + // We need more bytes in the buffer, at least one byte + ensureContinuousBlock(1); + // + // If there are too many bytes in the buffer, then append _charBuffer to StringBuilder + // and reset the _charBuffer. This ensures that the byte buffer cannot cause a char buffer overflow + if(_l - _o > _charBuffer.length - charBufferPosition) { + stringBuilder.append(_charBuffer, 0, charBufferPosition); + charBufferPosition = 0; + } + } + // + // Some characters in _charBuffer + if(charBufferPosition > 0) { + // + // if string is empty then create the string direct from _charBuffer + if(stringBuilder.length() == 0) { + return allocateString( _charBuffer, charBufferPosition); + } + // + // Append _charBuffer to final string + stringBuilder.append(_charBuffer, 0, charBufferPosition); + } + // + // Ok, we got an empty string + if(stringBuilder.length() == 0) + return ""; + + return stringBuilder.toString(); + } + /** + * Allocate a string from a char[]. This method uses a string cache to reduce memory consumption. 
+ * + * @param charBuffer source characters; at least one character is expected (index 0 is always read for short strings) + * @param length number of characters to take from the start of charBuffer + * @return a string equal to the first length characters, possibly a cached instance + */ + private String allocateString(final char charBuffer[], final int length) { + // + // We try to cache short strings + if(length < 16) { + // + // building a simple hash with the characters from charBuffer + int h = charBuffer[0] + 31; + for(int i = 1 ; i < length; i++) + h = h * 31 + charBuffer[i]; + // + // calculate index within hashtable + final int hashIndex = h & (_stringCache.length - 1); + // + // try to read cached string + String cachedString = _stringCache[hashIndex]; + // + // Found a cached string with correct length? + if(cachedString != null && cachedString.length() == length) { + int i = length - 1; + // + // Compare starting from the end + while(i >= 0) { + if(charBuffer[i] != cachedString.charAt(i)) + break; + + --i; + } + // + // if both are equal we can return the cached instance of this string + if(i < 0) + return cachedString; + } + // + // Write a new string to cache. overwrite any previous value + cachedString = new String(charBuffer, 0, length); + _stringCache[hashIndex] = cachedString; - isAcii = _isAscii( _random[0] ) && _isAscii( _random[1] ); - } - - - while ( true ){ - byte b = read(); - if ( b == 0 ) - break; - _stringBuffer.write( b ); - isAcii = isAcii && _isAscii( b ); - } + return cachedString; + } - String out = null; - if ( isAcii ){ - out = _stringBuffer.asString(); - } - else { - try { - out = _stringBuffer.asString( "UTF-8" ); - } - catch ( UnsupportedOperationException e ){ - throw new RuntimeException( "impossible" , e ); - } - } - _stringBuffer.reset(); - return out; + return new String(charBuffer, 0, length); } - + /** + * Read an UTF8-String from {@link InputStream}. 
+ * + * @return the decoded string; the last of the size bytes (the terminator) is not part of it + * @throws IOException if reading from the underlying stream fails + */ String readUTF8String() throws IOException { - int size = readInt(); + // + // Read size (it counts the terminating byte, so it must be at least 1) and ensure that the complete string is in the buffer + final int size = readInt(); - if ( size < 0 || size > ( 3 * 1024 * 1024 ) ) + if ( size <= 0 || size > ( 3 * 1024 * 1024 ) ) throw new RuntimeException( "bad string size: " + size ); - byte[] b = size < _random.length ? _random : new byte[size]; - - fill( b , size ); + ensureContinuousBlock(size); + // + // Start of the string is the current pointer in buffer + final int startOfString = _o; + // + // Increase offset by size of string + _o += size; + try { - return new String( b , 0 , size - 1 , "UTF-8" ); + return new String( _random, startOfString , size - 1 , "UTF-8" ); } catch ( java.io.UnsupportedEncodingException uee ){ throw new RuntimeException( "impossible" , uee ); } } + /** + * Returns the number of bytes read so far. + * + * @return bytes already accounted for in _read plus the offset consumed within the current buffer + */ + int getBytesRead() { + return _read + _o; + } + int _o; + int _l; int _read; + final InputStream _in; + int _length; } - private Input _in; private BSONCallback _callback; - private byte[] _random = new byte[1024]; // has to be used within a single function - - private PoolOutputBuffer _stringBuffer = new PoolOutputBuffer(); - - static final String[] ONE_BYTE_STRINGS = new String[128]; - static void _fillRange( byte min, byte max ){ - while ( min < max ){ - String s = ""; - s += (char)min; - ONE_BYTE_STRINGS[(int)min] = s; - min++; - } - } - static { - _fillRange( (byte)'0' , (byte)'9' ); - _fillRange( (byte)'a' , (byte)'z' ); - _fillRange( (byte)'A' , (byte)'Z' ); - } + private byte[] _random = new byte[1024]; + private char _charBuffer[] = new char[1024]; + + private static String _stringCache[] = new String[1024]; + /** + * {@link SoftReference} to {@link StringBuilder} to allow reclaiming of memory by GC + */ + private SoftReference<StringBuilder> _stringBuilder = new SoftReference<StringBuilder>(null); }