Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/arrays/integer.py : 25%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import numbers
2from typing import Any, Tuple, Type
3import warnings
5import numpy as np
7from pandas._libs import lib, missing as libmissing
8from pandas.compat import set_function_name
9from pandas.util._decorators import cache_readonly
11from pandas.core.dtypes.base import ExtensionDtype
12from pandas.core.dtypes.cast import astype_nansafe
13from pandas.core.dtypes.common import (
14 is_bool_dtype,
15 is_float,
16 is_float_dtype,
17 is_integer,
18 is_integer_dtype,
19 is_list_like,
20 is_object_dtype,
21 is_scalar,
22 pandas_dtype,
23)
24from pandas.core.dtypes.dtypes import register_extension_dtype
25from pandas.core.dtypes.missing import isna
27from pandas.core import nanops, ops
28from pandas.core.indexers import check_array_indexer
29from pandas.core.ops import invalid_comparison
30from pandas.core.ops.common import unpack_zerodim_and_defer
31from pandas.core.tools.numeric import to_numeric
33from .masked import BaseMaskedArray
class _IntegerDtype(ExtensionDtype):
    """
    An ExtensionDtype to hold a single size & kind of integer dtype.

    These specific implementations are subclasses of the non-public
    _IntegerDtype. For example we have Int8Dtype to represent signed int 8s.

    The attributes name & type are set when these subclasses are created.
    """

    name: str
    base = None
    type: Type
    na_value = libmissing.NA

    def __repr__(self) -> str:
        # e.g. "Int8Dtype()" / "UInt64Dtype()": sign prefix + bit width.
        sign = "U" if self.is_unsigned_integer else ""
        return f"{sign}Int{8 * self.itemsize}Dtype()"

    @cache_readonly
    def is_signed_integer(self):
        """ Return True if the underlying numpy dtype kind is 'i' """
        return self.kind == "i"

    @cache_readonly
    def is_unsigned_integer(self):
        """ Return True if the underlying numpy dtype kind is 'u' """
        return self.kind == "u"

    @property
    def _is_numeric(self):
        return True

    @cache_readonly
    def numpy_dtype(self):
        """ Return an instance of our numpy dtype """
        return np.dtype(self.type)

    @cache_readonly
    def kind(self):
        """ Return the kind character of our numpy dtype """
        return self.numpy_dtype.kind

    @cache_readonly
    def itemsize(self):
        """ Return the number of bytes in this dtype """
        return self.numpy_dtype.itemsize

    @classmethod
    def construct_array_type(cls):
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return IntegerArray

    def __from_arrow__(self, array):
        """
        Construct IntegerArray from passed pyarrow Array/ChunkedArray.

        Parameters
        ----------
        array : pyarrow.Array or pyarrow.ChunkedArray
            Cast to the pyarrow equivalent of this dtype first if its type
            differs.

        Returns
        -------
        IntegerArray
        """
        import pyarrow
        from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

        pyarrow_type = pyarrow.from_numpy_dtype(self.type)
        if not array.type.equals(pyarrow_type):
            array = array.cast(pyarrow_type)

        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            # pyarrow.ChunkedArray
            chunks = array.chunks

        results = []
        for arr in chunks:
            data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type)
            # The helper's mask is inverted before use; IntegerArray expects
            # True to mean "missing" (presumably the helper returns validity).
            int_arr = IntegerArray(data.copy(), ~mask, copy=False)
            results.append(int_arr)

        if not results:
            # A ChunkedArray may hold zero chunks; build the empty result
            # directly instead of asking _concat_same_type to combine an
            # empty list of arrays.
            return IntegerArray(
                np.array([], dtype=self.numpy_dtype), np.array([], dtype=bool)
            )

        return IntegerArray._concat_same_type(results)
def integer_array(values, dtype=None, copy=False):
    """
    Build an IntegerArray from list-like input, inferring the dtype if needed.

    The input is first split into a data array and a missing-value mask by
    ``coerce_to_array``; the two pieces are then wrapped in an IntegerArray.

    Parameters
    ----------
    values : 1D list-like
    dtype : dtype, optional
        dtype to coerce
    copy : bool, default False

    Returns
    -------
    IntegerArray

    Raises
    ------
    TypeError if incompatible types
    """
    data, na_mask = coerce_to_array(values, dtype=dtype, copy=copy)
    return IntegerArray(data, na_mask)
def safe_cast(values, dtype, copy):
    """
    Safely cast the values to the dtype if they
    are equivalent, meaning floats must be equivalent to the
    ints.

    Parameters
    ----------
    values : numpy.ndarray
    dtype : numpy dtype, or anything ``ndarray.astype`` accepts
    copy : bool
        Forwarded to ``ndarray.astype``.

    Returns
    -------
    numpy.ndarray
        ``values`` cast to ``dtype``.

    Raises
    ------
    TypeError
        If the cast is not "safe" by NumPy's rules and the cast values do
        not all compare equal to the originals.
    """
    try:
        return values.astype(dtype, casting="safe", copy=copy)
    except TypeError as err:
        # NumPy refused the "safe" cast; accept an unrestricted cast only if
        # every element survives it unchanged.
        casted = values.astype(dtype, copy=copy)
        if (casted == values).all():
            return casted

        # Chain the NumPy error so the original cause stays in the traceback.
        raise TypeError(
            f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
        ) from err
def coerce_to_array(values, dtype, mask=None, copy=False):
    """
    Coerce the input values array to numpy arrays with a mask

    Parameters
    ----------
    values : 1D list-like
    dtype : integer dtype
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)

    Raises
    ------
    ValueError
        If ``dtype`` does not map to one of the registered integer dtypes.
    TypeError
        If the values cannot be converted to an integer dtype, are not 1D,
        or cannot be cast safely.
    """
    # if values is integer numpy array, preserve its dtype
    if dtype is None and hasattr(values, "dtype"):
        if is_integer_dtype(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        if isinstance(dtype, str) and dtype.startswith(("Int", "UInt")):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not isinstance(dtype, _IntegerDtype):
            try:
                dtype = _dtypes[str(np.dtype(dtype))]
            except KeyError as err:
                # Keep the failed lookup attached as the cause.
                raise ValueError(f"invalid dtype specified {dtype}") from err

    if isinstance(values, IntegerArray):
        # Already split into data + mask: reuse both, only recasting the data.
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    values = np.array(values, copy=copy)
    if is_object_dtype(values):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "empty":
            # Nothing but missing values: use an all-NaN float array so the
            # mask computed below marks every element.
            values = np.empty(len(values))
            values.fill(np.nan)
        elif inferred_type not in [
            "floating",
            "integer",
            "mixed-integer",
            "integer-na",
            "mixed-integer-float",
        ]:
            raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")

    elif is_bool_dtype(values) and is_integer_dtype(dtype):
        values = np.array(values, dtype=int, copy=copy)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")

    if mask is None:
        mask = isna(values)
    else:
        assert len(mask) == len(values)

    if not values.ndim == 1:
        raise TypeError("values must be a 1D list-like")
    if not mask.ndim == 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = np.dtype("int64")
    else:
        dtype = dtype.type

    # if we are float, let's make sure that we can
    # safely cast

    # we copy as need to coerce here
    if mask.any():
        # Overwrite masked slots with a valid integer so the equivalence
        # check inside safe_cast is not tripped by the missing entries.
        values = values.copy()
        values[mask] = 1
    values = safe_cast(values, dtype, copy=False)

    return values, mask
class IntegerArray(BaseMaskedArray):
    """
    Array of integer (optional missing) values.

    .. versionadded:: 0.24.0

    .. versionchanged:: 1.0.0

       Now uses :attr:`pandas.NA` as the missing value rather
       than :attr:`numpy.nan`.

    .. warning::

       IntegerArray is currently experimental, and its API or internal
       implementation may change without warning.

    We represent an IntegerArray with 2 numpy arrays:

    - data: contains a numpy integer array of the appropriate dtype
    - mask: a boolean array holding a mask on the data, True is missing

    To construct an IntegerArray from generic array-like input, use
    :func:`pandas.array` with one of the integer dtypes (see examples).

    See :ref:`integer_na` for more.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d integer-dtype array.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values.
    copy : bool, default False
        Whether to copy the `values` and `mask`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    IntegerArray

    Examples
    --------
    Create an IntegerArray with :func:`pandas.array`.

    >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
    >>> int_array
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: Int32

    String aliases for the dtypes are also available. They are capitalized.

    >>> pd.array([1, None, 3], dtype='Int32')
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: Int32

    >>> pd.array([1, None, 3], dtype='UInt16')
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: UInt16
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = 1

    @cache_readonly
    def dtype(self):
        """Return the registered integer dtype matching the data's numpy dtype."""
        return _dtypes[str(self._data.dtype)]

    def __init__(self, values, mask, copy=False):
        if not (isinstance(values, np.ndarray) and is_integer_dtype(values.dtype)):
            raise TypeError(
                "values should be integer numpy array. Use "
                "the 'integer_array' function instead"
            )
        if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
            raise TypeError(
                "mask should be boolean numpy array. Use "
                "the 'integer_array' function instead"
            )

        if copy:
            values = values.copy()
            mask = mask.copy()

        self._data = values
        self._mask = mask

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Construct an IntegerArray from a sequence via ``integer_array``."""
        return integer_array(scalars, dtype=dtype, copy=copy)

    @classmethod
    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
        """Parse the strings with ``to_numeric`` (raising on failure), then build."""
        scalars = to_numeric(strings, errors="raise")
        return cls._from_sequence(scalars, dtype, copy)

    @classmethod
    def _from_factorized(cls, values, original):
        """Rebuild an array from factorized values using ``original``'s dtype."""
        return integer_array(values, dtype=original.dtype)

    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        """
        Apply a NumPy ufunc to the underlying data and re-mask the result.

        Returns ``NotImplemented`` when any input or ``out`` argument is not
        an ndarray, a number or an IntegerArray. Ufuncs that produce several
        outputs yield a tuple with one reconstructed result per output.

        Raises
        ------
        NotImplementedError
            For the 'reduce' method.
        """
        # For IntegerArray inputs, we apply the ufunc to ._data
        # and mask the result.
        if method == "reduce":
            # Not clear how to handle missing values in reductions. Raise.
            raise NotImplementedError("The 'reduce' method is not supported.")
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (IntegerArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        # Union of the masks of all IntegerArray inputs; raw data is what
        # actually gets passed to the ufunc.
        mask = np.zeros(len(self), dtype=bool)
        inputs2 = []
        for x in inputs:
            if isinstance(x, IntegerArray):
                mask |= x._mask
                inputs2.append(x._data)
            else:
                inputs2.append(x)

        def reconstruct(x):
            # we don't worry about scalar `x` here, since we
            # raise for reduce up above.

            if is_integer_dtype(x.dtype):
                m = mask.copy()
                return IntegerArray(x, m)
            else:
                x[mask] = np.nan
            return x

        result = getattr(ufunc, method)(*inputs2, **kwargs)
        if isinstance(result, tuple):
            # Multi-output ufuncs: the tuple must be returned, otherwise the
            # caller receives None.
            return tuple(reconstruct(x) for x in result)
        else:
            return reconstruct(result)

    def __setitem__(self, key, value):
        # Wrap scalars in a list so coerce_to_array can validate them, then
        # unwrap again before assigning.
        _is_scalar = is_scalar(value)
        if _is_scalar:
            value = [value]
        value, mask = coerce_to_array(value, dtype=self.dtype)

        if _is_scalar:
            value = value[0]
            mask = mask[0]

        key = check_array_indexer(self, key)
        self._data[key] = value
        self._mask[key] = mask

    def astype(self, dtype, copy=True):
        """
        Cast to a NumPy array or IntegerArray with 'dtype'.

        Parameters
        ----------
        dtype : str or dtype
            Typecode or data-type to which the array is cast.
        copy : bool, default True
            Whether to copy the data, even if not necessary. If False,
            a copy is made only if the old dtype does not match the
            new dtype.

        Returns
        -------
        array : ndarray or IntegerArray
            NumPy ndarray or IntegerArray with 'dtype' for its dtype.

        Raises
        ------
        TypeError
            if incompatible type with an IntegerDtype, equivalent of same_kind
            casting
        """
        from pandas.core.arrays.boolean import BooleanArray, BooleanDtype

        dtype = pandas_dtype(dtype)

        # NOTE(review): both fast paths below pass copy=False and reuse
        # self._mask, so `copy` is not consulted here - confirm intended.

        # if we are astyping to an existing IntegerDtype we can fastpath
        if isinstance(dtype, _IntegerDtype):
            result = self._data.astype(dtype.numpy_dtype, copy=False)
            return type(self)(result, mask=self._mask, copy=False)
        elif isinstance(dtype, BooleanDtype):
            result = self._data.astype("bool", copy=False)
            return BooleanArray(result, mask=self._mask, copy=False)

        # coerce
        if is_float_dtype(dtype):
            # In astype, we consider dtype=float to also mean na_value=np.nan
            kwargs = dict(na_value=np.nan)
        else:
            kwargs = {}

        data = self.to_numpy(dtype=dtype, **kwargs)
        return astype_nansafe(data, dtype, copy=False)

    @property
    def _ndarray_values(self) -> np.ndarray:
        """Internal pandas method for lossy conversion to a NumPy ndarray.

        This method is not part of the pandas interface.

        The expectation is that this is cheap to compute, and is primarily
        used for interacting with our indexers.
        """
        return self._data

    def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
        # TODO: https://github.com/pandas-dev/pandas/issues/30037
        # use masked algorithms, rather than object-dtype / np.nan.
        return self.to_numpy(na_value=np.nan), np.nan

    def _values_for_argsort(self) -> np.ndarray:
        """Return values for sorting.

        Returns
        -------
        ndarray
            The transformed values should maintain the ordering between values
            within the array.

        See Also
        --------
        ExtensionArray.argsort
        """
        data = self._data.copy()
        # Only touch the data when something is masked: min() of a
        # zero-length array raises, and without masked entries the
        # assignment would be a no-op anyway.
        if self._mask.any():
            data[self._mask] = data.min() - 1
        return data

    @classmethod
    def _create_comparison_method(cls, op):
        """Build the comparison dunder for ``op``; it returns a BooleanArray."""
        op_name = op.__name__

        @unpack_zerodim_and_defer(op.__name__)
        def cmp_method(self, other):
            from pandas.arrays import BooleanArray

            mask = None

            if isinstance(other, (BooleanArray, IntegerArray)):
                other, mask = other._data, other._mask

            elif is_list_like(other):
                other = np.asarray(other)
                if other.ndim > 1:
                    raise NotImplementedError(
                        "can only perform ops with 1-d structures"
                    )
                if len(self) != len(other):
                    raise ValueError("Lengths must match to compare")

            if other is libmissing.NA:
                # numpy does not handle pd.NA well as "other" scalar (it returns
                # a scalar False instead of an array)
                # This may be fixed by NA.__array_ufunc__. Revisit this check
                # once that's implemented.
                result = np.zeros(self._data.shape, dtype="bool")
                mask = np.ones(self._data.shape, dtype="bool")
            else:
                with warnings.catch_warnings():
                    # numpy may show a FutureWarning:
                    #     elementwise comparison failed; returning scalar instead,
                    #     but in the future will perform elementwise comparison
                    # before returning NotImplemented. We fall back to the correct
                    # behavior today, so that should be fine to ignore.
                    warnings.filterwarnings("ignore", "elementwise", FutureWarning)
                    with np.errstate(all="ignore"):
                        method = getattr(self._data, f"__{op_name}__")
                        result = method(other)

                    if result is NotImplemented:
                        result = invalid_comparison(self._data, other, op)

            # nans propagate
            if mask is None:
                mask = self._mask.copy()
            else:
                mask = self._mask | mask

            return BooleanArray(result, mask)

        name = f"__{op.__name__}__"
        return set_function_name(cmp_method, name, cls)

    def _reduce(self, name, skipna=True, **kwargs):
        """
        Run the ``nanops`` reduction called ``"nan" + name`` over the array.

        Returns :attr:`pandas.NA` when the reduction result is NaN. Results
        of sum/min/max/prod are converted back to ``int`` when that is
        lossless.
        """
        data = self._data
        mask = self._mask

        # coerce to a nan-aware float if needed
        # (we explicitly use NaN within reductions)
        if self._hasna:
            data = self.to_numpy("float64", na_value=np.nan)

        op = getattr(nanops, "nan" + name)
        result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

        if np.isnan(result):
            return libmissing.NA

        # if we have a boolean op, don't coerce
        if name in ["any", "all"]:
            pass

        # if we have a preservable numeric op,
        # provide coercion back to an integer type if possible
        elif name in ["sum", "min", "max", "prod"]:
            int_result = int(result)
            if int_result == result:
                result = int_result

        return result

    def _maybe_mask_result(self, result, mask, other, op_name):
        """
        Wrap an arithmetic result, either as a NaN-filled ndarray or as a
        masked array of this type.

        Parameters
        ----------
        result : array-like
        mask : array-like bool
        other : scalar or array-like
        op_name : str
        """
        # if we have a float operand we are by-definition
        # a float result
        # or our op is a divide
        if (is_float_dtype(other) or is_float(other)) or (
            op_name in ["rtruediv", "truediv"]
        ):
            result[mask] = np.nan
            return result

        return type(self)(result, mask, copy=False)

    @classmethod
    def _create_arithmetic_method(cls, op):
        """Build the arithmetic dunder for ``op``, propagating missing values."""
        op_name = op.__name__

        @unpack_zerodim_and_defer(op.__name__)
        def integer_arithmetic_method(self, other):

            omask = None

            if getattr(other, "ndim", 0) > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")

            if isinstance(other, IntegerArray):
                other, omask = other._data, other._mask

            elif is_list_like(other):
                other = np.asarray(other)
                if other.ndim > 1:
                    raise NotImplementedError(
                        "can only perform ops with 1-d structures"
                    )
                if len(self) != len(other):
                    raise ValueError("Lengths must match")
                if not (is_float_dtype(other) or is_integer_dtype(other)):
                    raise TypeError("can only perform ops with numeric values")

            else:
                if not (is_float(other) or is_integer(other) or other is libmissing.NA):
                    raise TypeError("can only perform ops with numeric values")

            if omask is None:
                mask = self._mask.copy()
                if other is libmissing.NA:
                    # A scalar NA operand makes every result missing.
                    mask |= True
            else:
                mask = self._mask | omask

            if op_name == "pow":
                # 1 ** x is 1.
                mask = np.where((self._data == 1) & ~self._mask, False, mask)
                # x ** 0 is 1.
                if omask is not None:
                    mask = np.where((other == 0) & ~omask, False, mask)
                elif other is not libmissing.NA:
                    mask = np.where(other == 0, False, mask)

            elif op_name == "rpow":
                # 1 ** x is 1.
                if omask is not None:
                    mask = np.where((other == 1) & ~omask, False, mask)
                elif other is not libmissing.NA:
                    mask = np.where(other == 1, False, mask)
                # x ** 0 is 1.
                mask = np.where((self._data == 0) & ~self._mask, False, mask)

            if other is libmissing.NA:
                # Placeholder data; which entries are missing is decided by
                # `mask` above.
                result = np.ones_like(self._data)
            else:
                with np.errstate(all="ignore"):
                    result = op(self._data, other)

            # divmod returns a tuple
            if op_name == "divmod":
                div, mod = result
                return (
                    self._maybe_mask_result(div, mask, other, "floordiv"),
                    self._maybe_mask_result(mod, mask, other, "mod"),
                )

            return self._maybe_mask_result(result, mask, other, op_name)

        name = f"__{op.__name__}__"
        return set_function_name(integer_arithmetic_method, name, cls)


# Install the operator dunders on the class. These hooks are inherited;
# presumably they call the _create_*_method factories above once per operator.
IntegerArray._add_arithmetic_ops()
IntegerArray._add_comparison_ops()
# Docstring template shared by every concrete integer dtype; ``{dtype}`` is
# replaced with the lower-case numpy name (e.g. "int8").
_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.

.. versionchanged:: 1.0.0

   Now uses :attr:`pandas.NA` as its missing value,
   rather than :attr:`numpy.nan`.

Attributes
----------
None

Methods
-------
None
"""

# create the Dtype


@register_extension_dtype
class Int8Dtype(_IntegerDtype):
    type = np.int8
    name = "Int8"
    __doc__ = _dtype_docstring.format(dtype="int8")


@register_extension_dtype
class Int16Dtype(_IntegerDtype):
    type = np.int16
    name = "Int16"
    __doc__ = _dtype_docstring.format(dtype="int16")


@register_extension_dtype
class Int32Dtype(_IntegerDtype):
    type = np.int32
    name = "Int32"
    __doc__ = _dtype_docstring.format(dtype="int32")


@register_extension_dtype
class Int64Dtype(_IntegerDtype):
    type = np.int64
    name = "Int64"
    __doc__ = _dtype_docstring.format(dtype="int64")


@register_extension_dtype
class UInt8Dtype(_IntegerDtype):
    type = np.uint8
    name = "UInt8"
    __doc__ = _dtype_docstring.format(dtype="uint8")


@register_extension_dtype
class UInt16Dtype(_IntegerDtype):
    type = np.uint16
    name = "UInt16"
    __doc__ = _dtype_docstring.format(dtype="uint16")


@register_extension_dtype
class UInt32Dtype(_IntegerDtype):
    type = np.uint32
    name = "UInt32"
    __doc__ = _dtype_docstring.format(dtype="uint32")


@register_extension_dtype
class UInt64Dtype(_IntegerDtype):
    type = np.uint64
    name = "UInt64"
    __doc__ = _dtype_docstring.format(dtype="uint64")


# Registry of one shared instance per dtype, keyed by the lower-case numpy
# dtype name ("int8" ... "uint64").
_dtypes = {
    instance.name.lower(): instance
    for instance in (
        Int8Dtype(),
        Int16Dtype(),
        Int32Dtype(),
        Int64Dtype(),
        UInt8Dtype(),
        UInt16Dtype(),
        UInt32Dtype(),
        UInt64Dtype(),
    )
}