Merge pull request #1530 from dhermes/happybase-table-helpers

dhermes · dhermes · commit f87ee7e93510 · 2016-02-25T09:43:58.000-08:00
Adding HappyBase table helpers.
diff --git a/gcloud/bigtable/happybase/table.py b/gcloud/bigtable/happybase/table.py
@@ -19,13 +19,17 @@
 
 import six
 
+from gcloud._helpers import _datetime_from_microseconds
+from gcloud._helpers import _microseconds_from_datetime
+from gcloud._helpers import _to_bytes
 from gcloud._helpers import _total_seconds
 from gcloud.bigtable.column_family import GCRuleIntersection
 from gcloud.bigtable.column_family import MaxAgeGCRule
 from gcloud.bigtable.column_family import MaxVersionsGCRule
 from gcloud.bigtable.happybase.batch import _WAL_SENTINEL
 from gcloud.bigtable.happybase.batch import Batch
 from gcloud.bigtable.table import Table as _LowLevelTable
+from gcloud.bigtable.row import TimestampRange
 
 
 _UNPACK_I64 = struct.Struct('>q').unpack
@@ -563,3 +567,155 @@ def _gc_rule_to_dict(gc_rule):
                 if key1 != key2:
                     result = {key1: rule1[key1], key2: rule2[key2]}
     return result
+
+
+def _next_char(str_val, index):
+    """Return the byte that follows the one at ``index`` in ``str_val``.
+
+    :type str_val: str
+    :param str_val: Byte string holding the byte to increment.
+
+    :type index: int
+    :param index: Position in ``str_val`` of the byte to increment.
+
+    :rtype: str
+    :returns: A single byte whose ordinal is one more than the
+              ordinal of the byte at ``index``.
+    """
+    successor = chr(six.indexbytes(str_val, index) + 1)
+    return _to_bytes(successor, encoding='latin-1')
+
+
+def _string_successor(str_val):
+    """Increment and truncate a byte string.
+
+    Trailing ``0xFF`` bytes are dropped and the last remaining byte is
+    incremented. The result sorts after ``str_val`` and after every
+    string that starts with ``str_val``, so it is an exclusive upper
+    bound for that prefix under regular string comparison semantics.
+
+    Modeled after implementation in ``gcloud-golang``.
+
+    If ``str_val`` is empty or contains only ``0xFF`` bytes, no byte
+    can be incremented and ``b''`` is returned.
+
+    :type str_val: str
+    :param str_val: String to increment. It is first passed through
+                    ``_to_bytes`` with the ``latin-1`` encoding.
+
+    :rtype: str
+    :returns: The incremented and truncated byte string, or ``b''``
+              when there is no byte that can be incremented.
+    """
+    str_val = _to_bytes(str_val, encoding='latin-1')
+
+    # Walk backwards to the last byte that is not already ``0xFF``.
+    for position in range(len(str_val) - 1, -1, -1):
+        if six.indexbytes(str_val, position) != 0xff:
+            # Bump that byte and discard everything to its right.
+            return str_val[:position] + _next_char(str_val, position)
+
+    # Reached for empty input and for input made only of ``0xFF``
+    # bytes: nothing could be incremented.
+    return b''
+
+
+def _convert_to_time_range(timestamp=None):
+    """Build a timestamp range ending at an HBase / HappyBase timestamp.
+
+    HBase uses timestamp as an argument to specify an exclusive end
+    deadline. Cloud Bigtable also uses exclusive end times, so the
+    value is used as the ``end`` of the range without adjustment.
+
+    :type timestamp: int
+    :param timestamp: (Optional) Exclusive end of an HBase time range,
+                      in milliseconds since the epoch.
+
+    :rtype: :class:`.TimestampRange`, :data:`NoneType <types.NoneType>`
+    :returns: A range with only ``end`` set, or :data:`None` when no
+              ``timestamp`` is given.
+    """
+    if timestamp is not None:
+        # Milliseconds (HBase) -> microseconds (helper input).
+        end_time = _datetime_from_microseconds(1000 * timestamp)
+        return TimestampRange(end=end_time)
+
+    return None
+
+
+def _cells_to_pairs(cells, include_timestamp=False):
+    """Convert a list of cells to the HappyBase representation.
+
+    For example::
+
+      >>> import datetime
+      >>> from gcloud.bigtable.row_data import Cell
+      >>> cell1 = Cell(b'val1', datetime.datetime.utcnow())
+      >>> cell2 = Cell(b'val2', datetime.datetime.utcnow())
+      >>> _cells_to_pairs([cell1, cell2])
+      [b'val1', b'val2']
+      >>> _cells_to_pairs([cell1, cell2], include_timestamp=True)
+      [(b'val1', 1456361486255), (b'val2', 1456361491927)]
+
+    :type cells: list
+    :param cells: List of :class:`.Cell` returned from a read request.
+
+    :type include_timestamp: bool
+    :param include_timestamp: Flag to indicate if each cell's timestamp
+                              should accompany its value in the output.
+
+    :rtype: list
+    :returns: The cell values, in input order. If
+              ``include_timestamp=True``, each entry is instead a pair
+              of the cell's bytes value and its timestamp in whole
+              milliseconds.
+    """
+    if not include_timestamp:
+        return [cell.value for cell in cells]
+
+    # Cell timestamps are datetimes; HappyBase expects milliseconds.
+    return [
+        (cell.value, _microseconds_from_datetime(cell.timestamp) // 1000)
+        for cell in cells
+    ]
+
+
+def _partial_row_to_dict(partial_row_data, include_timestamp=False):
+    """Convert a low-level row data object to a dictionary.
+
+    Keeps only the first cell of each column: callers of this method use
+    a ``CellsColumnLimitFilter(1)`` filter and need just the latest value.
+
+    For example::
+
+      >>> import datetime
+      >>> from gcloud.bigtable.row_data import Cell, PartialRowData
+      >>> cell1 = Cell(b'val1', datetime.datetime.utcnow())
+      >>> cell2 = Cell(b'val2', datetime.datetime.utcnow())
+      >>> row_data = PartialRowData(b'row-key')
+      >>> _partial_row_to_dict(row_data)
+      {}
+      >>> row_data._cells[u'fam1'] = {b'col1': [cell1], b'col2': [cell2]}
+      >>> _partial_row_to_dict(row_data)
+      {b'fam1:col2': b'val2', b'fam1:col1': b'val1'}
+      >>> _partial_row_to_dict(row_data, include_timestamp=True)
+      {b'fam1:col2': (b'val2', 1456361724480),
+       b'fam1:col1': (b'val1', 1456361721135)}
+
+    :type partial_row_data: :class:`.row_data.PartialRowData`
+    :param partial_row_data: Row data consumed from a stream.
+
+    :type include_timestamp: bool
+    :param include_timestamp: Flag to indicate if cell timestamps should
+                              accompany the values in the output.
+
+    :rtype: dict
+    :returns: The row data converted to a dictionary.
+    """
+    converted = {}
+    row_columns = partial_row_data.to_dict()
+    for name, cells in six.iteritems(row_columns):
+        pairs = _cells_to_pairs(cells, include_timestamp=include_timestamp)
+        # Anything past the first cell is ignored.
+        converted[name] = pairs[0]
+    return converted
diff --git a/gcloud/bigtable/happybase/test_table.py b/gcloud/bigtable/happybase/test_table.py
@@ -443,6 +443,142 @@ def test_with_intersection_two_nested_rules(self):
         self.assertTrue(result is gc_rule)
 
 
+class Test__string_successor(unittest2.TestCase):
+    """Unit tests for the ``_string_successor`` helper."""
+    def _callFUT(self, *args, **kwargs):
+        from gcloud.bigtable.happybase.table import _string_successor
+        return _string_successor(*args, **kwargs)
+
+    def test_with_alphanumeric(self):
+        self.assertEqual(self._callFUT(b'boa'), b'bob')
+        self.assertEqual(self._callFUT(b'abc1'), b'abc2')
+
+    def test_with_last_byte(self):
+        self.assertEqual(self._callFUT(b'boa\xff'), b'bob')  # 0xFF dropped
+
+    def test_with_empty_string(self):
+        self.assertEqual(self._callFUT(b''), b'')
+
+    def test_with_all_last_bytes(self):
+        self.assertEqual(self._callFUT(b'\xff\xff\xff'), b'')  # no successor
+
+    def test_with_unicode_input(self):
+        self.assertEqual(self._callFUT(u'boa'), b'bob')  # text in, bytes out
+
+
+class Test__convert_to_time_range(unittest2.TestCase):
+    """Unit tests for the ``_convert_to_time_range`` helper."""
+    def _callFUT(self, timestamp=None):
+        from gcloud.bigtable.happybase.table import _convert_to_time_range
+        return _convert_to_time_range(timestamp=timestamp)
+
+    def test_null(self):
+        timestamp = None
+        result = self._callFUT(timestamp=timestamp)
+        self.assertEqual(result, None)
+
+    def test_invalid_type(self):
+        timestamp = object()  # ``1000 * timestamp`` is unsupported
+        with self.assertRaises(TypeError):
+            self._callFUT(timestamp=timestamp)
+
+    def test_success(self):
+        from gcloud._helpers import _datetime_from_microseconds
+        from gcloud.bigtable.row import TimestampRange
+
+        timestamp = 1441928298571  # milliseconds
+        ts_dt = _datetime_from_microseconds(1000 * timestamp)
+        result = self._callFUT(timestamp=timestamp)
+        self.assertTrue(isinstance(result, TimestampRange))
+        self.assertEqual(result.start, None)
+        self.assertEqual(result.end, ts_dt)
+
+
+class Test__cells_to_pairs(unittest2.TestCase):
+    """Unit tests for the ``_cells_to_pairs`` helper."""
+    def _callFUT(self, *args, **kwargs):
+        from gcloud.bigtable.happybase.table import _cells_to_pairs
+        return _cells_to_pairs(*args, **kwargs)
+
+    def test_without_timestamp(self):
+        from gcloud.bigtable.row_data import Cell
+
+        value1 = 'foo'
+        cell1 = Cell(value=value1, timestamp=None)  # timestamp is not read
+        value2 = 'bar'
+        cell2 = Cell(value=value2, timestamp=None)
+
+        result = self._callFUT([cell1, cell2])
+        self.assertEqual(result, [value1, value2])
+
+    def test_with_timestamp(self):
+        from gcloud._helpers import _datetime_from_microseconds
+        from gcloud.bigtable.row_data import Cell
+
+        value1 = 'foo'
+        ts1_millis = 1221934570148
+        ts1 = _datetime_from_microseconds(ts1_millis * 1000)
+        cell1 = Cell(value=value1, timestamp=ts1)
+
+        value2 = 'bar'
+        ts2_millis = 1221955575548
+        ts2 = _datetime_from_microseconds(ts2_millis * 1000)
+        cell2 = Cell(value=value2, timestamp=ts2)
+
+        result = self._callFUT([cell1, cell2], include_timestamp=True)
+        self.assertEqual(result,
+                         [(value1, ts1_millis), (value2, ts2_millis)])
+
+
+class Test__partial_row_to_dict(unittest2.TestCase):
+    """Unit tests for the ``_partial_row_to_dict`` helper."""
+    def _callFUT(self, partial_row_data, include_timestamp=False):
+        from gcloud.bigtable.happybase.table import _partial_row_to_dict
+        return _partial_row_to_dict(partial_row_data,
+                                    include_timestamp=include_timestamp)
+
+    def test_without_timestamp(self):
+        from gcloud.bigtable.row_data import Cell
+        from gcloud.bigtable.row_data import PartialRowData
+
+        row_data = PartialRowData(b'row-key')
+        val1 = b'hi-im-bytes'
+        val2 = b'bi-im-hytes'
+        row_data._cells[u'fam1'] = {  # column -> list of cells
+            b'col1': [Cell(val1, None)],
+            b'col2': [Cell(val2, None)],
+        }
+        result = self._callFUT(row_data)
+        expected_result = {  # keys look like ``fam1:<column>``
+            b'fam1:col1': val1,
+            b'fam1:col2': val2,
+        }
+        self.assertEqual(result, expected_result)
+
+    def test_with_timestamp(self):
+        from gcloud._helpers import _datetime_from_microseconds
+        from gcloud.bigtable.row_data import Cell
+        from gcloud.bigtable.row_data import PartialRowData
+
+        row_data = PartialRowData(b'row-key')
+        val1 = b'hi-im-bytes'
+        ts1_millis = 1221934570148
+        ts1 = _datetime_from_microseconds(ts1_millis * 1000)
+        val2 = b'bi-im-hytes'
+        ts2_millis = 1331934880000
+        ts2 = _datetime_from_microseconds(ts2_millis * 1000)
+        row_data._cells[u'fam1'] = {  # column -> list of cells
+            b'col1': [Cell(val1, ts1)],
+            b'col2': [Cell(val2, ts2)],
+        }
+        result = self._callFUT(row_data, include_timestamp=True)
+        expected_result = {  # values become ``(value, millis)`` pairs
+            b'fam1:col1': (val1, ts1_millis),
+            b'fam1:col2': (val2, ts2_millis),
+        }
+        self.assertEqual(result, expected_result)
+
+
 class _Connection(object):
 
     def __init__(self, cluster):