1"""
2Base and utility classes for pandas objects.
3"""
4import builtins
5import textwrap
6from typing import Dict, FrozenSet, List, Optional
8import numpy as np
10import pandas._libs.lib as lib
11from pandas.compat import PYPY
12from pandas.compat.numpy import function as nv
13from pandas.errors import AbstractMethodError
14from pandas.util._decorators import Appender, Substitution, cache_readonly
15from pandas.util._validators import validate_bool_kwarg
17from pandas.core.dtypes.cast import is_nested_object
18from pandas.core.dtypes.common import (
19 is_categorical_dtype,
20 is_dict_like,
21 is_extension_array_dtype,
22 is_list_like,
23 is_object_dtype,
24 is_scalar,
25 needs_i8_conversion,
26)
27from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
28from pandas.core.dtypes.missing import isna
30from pandas.core import algorithms, common as com
31from pandas.core.accessor import DirNamesMixin
32from pandas.core.algorithms import duplicated, unique1d, value_counts
33from pandas.core.arrays import ExtensionArray
34from pandas.core.construction import create_series_with_explicit_dtype
35import pandas.core.nanops as nanops
37_shared_docs: Dict[str, str] = dict()
38_indexops_doc_kwargs = dict(
39 klass="IndexOpsMixin",
40 inplace="",
41 unique="IndexOpsMixin",
42 duplicated="IndexOpsMixin",
43)


class PandasObject(DirNamesMixin):
    """baseclass for various pandas objects"""

    @property
    def _constructor(self):
        """class constructor (for this class it's just `__class__`)"""
        return type(self)

    def __repr__(self) -> str:
        """
        Return a string representation for a particular object.
        """
        # Should be overwritten by base classes
        return object.__repr__(self)

    def _reset_cache(self, key=None):
        """
        Reset cached properties. If ``key`` is passed, only clears that key.
        """
        if getattr(self, "_cache", None) is None:
            return
        if key is None:
            self._cache.clear()
        else:
            self._cache.pop(key, None)

    def __sizeof__(self):
        """
        Generates the total memory usage for an object that returns
        either a value or Series of values
        """
        if hasattr(self, "memory_usage"):
            mem = self.memory_usage(deep=True)
            if not is_scalar(mem):
                mem = mem.sum()
            return int(mem)

        # no memory_usage attribute, so fall back to object's 'sizeof'
        return super().__sizeof__()
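
    # Illustrative doctest (editor's sketch, not part of pandas): because
    # ``__sizeof__`` defers to ``memory_usage(deep=True)`` when it exists,
    # the two report the same deep byte count for data-holding objects.
    #
    #   >>> import pandas as pd
    #   >>> s = pd.Series(["a" * 100] * 1000)
    #   >>> s.__sizeof__() == int(s.memory_usage(deep=True))  # doctest: +SKIP
    #   True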


class NoNewAttributesMixin:
    """Mixin which prevents adding new attributes.

    Prevents additional attributes via xxx.attribute = "something" after a
    call to `self.__freeze()`. Mainly used to prevent the user from using
    wrong attributes on an accessor (`Series.cat/.str/.dt`).

    If you really want to add a new attribute at a later time, you need to use
    `object.__setattr__(self, key, value)`.
    """

    def _freeze(self):
        """Prevents setting additional attributes"""
        object.__setattr__(self, "__frozen", True)

    # prevent adding any attribute via s.xxx.new_attribute = ...
    def __setattr__(self, key, value):
        # _cache is used by a decorator
        # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key)
        # because
        # 1.) getattr is false for attributes that raise errors
        # 2.) cls.__dict__ doesn't traverse into base classes
        if getattr(self, "__frozen", False) and not (
            key == "_cache"
            or key in type(self).__dict__
            or getattr(self, key, None) is not None
        ):
            raise AttributeError(f"You cannot add any new attribute '{key}'")
        object.__setattr__(self, key, value)
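
    # Illustrative doctest (editor's sketch, not part of pandas): once an
    # accessor is frozen, setting an unknown attribute raises instead of
    # silently creating an attribute the accessor would never use.
    #
    #   >>> import pandas as pd
    #   >>> s = pd.Series(["a", "b"])
    #   >>> s.str.my_typo = 1  # doctest: +SKIP
    #   Traceback (most recent call last):
    #       ...
    #   AttributeError: You cannot add any new attribute 'my_typo'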


class GroupByError(Exception):
    pass


class DataError(GroupByError):
    pass


class SpecificationError(GroupByError):
    pass


class SelectionMixin:
    """
    mixin implementing the selection & aggregation interface on a group-like
    object; sub-classes need to define: obj, exclusions
    """

    _selection = None
    _internal_names = ["_cache", "__setstate__"]
    _internal_names_set = set(_internal_names)

    _builtin_table = {builtins.sum: np.sum, builtins.max: np.max, builtins.min: np.min}

    _cython_table = {
        builtins.sum: "sum",
        builtins.max: "max",
        builtins.min: "min",
        np.all: "all",
        np.any: "any",
        np.sum: "sum",
        np.nansum: "sum",
        np.mean: "mean",
        np.nanmean: "mean",
        np.prod: "prod",
        np.nanprod: "prod",
        np.std: "std",
        np.nanstd: "std",
        np.var: "var",
        np.nanvar: "var",
        np.median: "median",
        np.nanmedian: "median",
        np.max: "max",
        np.nanmax: "max",
        np.min: "min",
        np.nanmin: "min",
        np.cumprod: "cumprod",
        np.nancumprod: "cumprod",
        np.cumsum: "cumsum",
        np.nancumsum: "cumsum",
    }
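
    # Illustrative doctest (editor's sketch, not part of pandas): these tables
    # let calls like ``agg(np.nanmean)`` or ``agg(builtins.sum)`` dispatch to
    # pandas' own optimized reductions instead of applying the callable
    # element by element.
    #
    #   >>> SelectionMixin._cython_table[np.nanmean]
    #   'mean'
    #   >>> SelectionMixin._builtin_table[sum] is np.sum
    #   True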

    @property
    def _selection_name(self):
        """
        return a name for myself; this would ideally be called
        the 'name' property, but we cannot conflict with the
        Series.name property which can be set
        """
        if self._selection is None:
            return None  # 'result'
        else:
            return self._selection

    @property
    def _selection_list(self):
        if not isinstance(
            self._selection, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)
        ):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):
        if self._selection is None or isinstance(self.obj, ABCSeries):
            return self.obj
        else:
            return self.obj[self._selection]

    @cache_readonly
    def ndim(self) -> int:
        return self._selected_obj.ndim

    @cache_readonly
    def _obj_with_exclusions(self):
        if self._selection is not None and isinstance(self.obj, ABCDataFrame):
            return self.obj.reindex(columns=self._selection_list)

        if len(self.exclusions) > 0:
            return self.obj.drop(self.exclusions, axis=1)
        else:
            return self.obj

    def __getitem__(self, key):
        if self._selection is not None:
            raise IndexError(f"Column(s) {self._selection} already selected")

        if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)):
            if len(self.obj.columns.intersection(key)) != len(key):
                bad_keys = list(set(key).difference(self.obj.columns))
                raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}")
            return self._gotitem(list(key), ndim=2)

        elif not getattr(self, "as_index", False):
            if key not in self.obj.columns:
                raise KeyError(f"Column not found: {key}")
            return self._gotitem(key, ndim=2)

        else:
            if key not in self.obj:
                raise KeyError(f"Column not found: {key}")
            return self._gotitem(key, ndim=1)

    def _gotitem(self, key, ndim, subset=None):
        """
        sub-classes to define
        return a sliced object

        Parameters
        ----------
        key : string / list of selections
        ndim : 1, 2
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        raise AbstractMethodError(self)

    def aggregate(self, func, *args, **kwargs):
        raise AbstractMethodError(self)

    agg = aggregate

    def _try_aggregate_string_function(self, arg: str, *args, **kwargs):
        """
        if arg is a string, then try to operate on it:
        - try to find a function (or attribute) on ourselves
        - try to find a numpy function
        - raise
        """
        assert isinstance(arg, str)

        f = getattr(self, arg, None)
        if f is not None:
            if callable(f):
                return f(*args, **kwargs)

            # people may try to aggregate on a non-callable attribute
            # but don't let them think they can pass args to it
            assert len(args) == 0
            assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0
            return f

        f = getattr(np, arg, None)
        if f is not None:
            if hasattr(self, "__array__"):
                # in particular exclude Window
                return f(self, *args, **kwargs)

        raise AttributeError(
            f"'{arg}' is not a valid function for '{type(self).__name__}' object"
        )
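
    # Illustrative doctest (editor's sketch, not part of pandas): string
    # aggregations resolve first to a method or attribute on the object, then
    # to a NumPy function of the same name, so through the public ``agg``:
    #
    #   >>> import pandas as pd
    #   >>> pd.Series([1, 2, 3]).agg("mean")  # found on the object itself
    #   2.0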

    def _aggregate(self, arg, *args, **kwargs):
        """
        provide an implementation for the aggregators

        Parameters
        ----------
        arg : string, dict, function
        *args : args to pass on to the function
        **kwargs : kwargs to pass on to the function

        Returns
        -------
        tuple of result, how

        Notes
        -----
        how can be a string describing the required post-processing, or
        None if not required
        """
        is_aggregator = lambda x: isinstance(x, (list, tuple, dict))

        _axis = kwargs.pop("_axis", None)
        if _axis is None:
            _axis = getattr(self, "axis", 0)

        if isinstance(arg, str):
            return self._try_aggregate_string_function(arg, *args, **kwargs), None

        if isinstance(arg, dict):
            # aggregate based on the passed dict
            if _axis != 0:  # pragma: no cover
                raise ValueError("Can only pass dict with axis=0")

            obj = self._selected_obj

            # if we have a dict of any non-scalars
            # eg. {'A' : ['mean']}, normalize all to
            # be list-likes
            if any(is_aggregator(x) for x in arg.values()):
                new_arg = {}
                for k, v in arg.items():
                    if not isinstance(v, (tuple, list, dict)):
                        new_arg[k] = [v]
                    else:
                        new_arg[k] = v

                    # the keys must be in the columns
                    # for ndim=2, or renamers for ndim=1

                    # ok for now, but deprecated
                    # {'A': { 'ra': 'mean' }}
                    # {'A': { 'ra': ['mean'] }}
                    # {'ra': ['mean']}

                    # not ok
                    # {'ra' : { 'A' : 'mean' }}
                    if isinstance(v, dict):
                        raise SpecificationError("nested renamer is not supported")
                    elif isinstance(obj, ABCSeries):
                        raise SpecificationError("nested renamer is not supported")
                    elif isinstance(obj, ABCDataFrame) and k not in obj.columns:
                        raise KeyError(f"Column '{k}' does not exist!")

                arg = new_arg

            else:
                # deprecation of renaming keys
                # GH 15931
                keys = list(arg.keys())
                if isinstance(obj, ABCDataFrame) and len(
                    obj.columns.intersection(keys)
                ) != len(keys):
                    raise SpecificationError("nested renamer is not supported")

            from pandas.core.reshape.concat import concat

            def _agg_1dim(name, how, subset=None):
                """
                aggregate a 1-dim with how
                """
                colg = self._gotitem(name, ndim=1, subset=subset)
                if colg.ndim != 1:
                    raise SpecificationError(
                        "nested dictionary is ambiguous in aggregation"
                    )
                return colg.aggregate(how)

            def _agg_2dim(name, how):
                """
                aggregate a 2-dim with how
                """
                colg = self._gotitem(self._selection, ndim=2, subset=obj)
                return colg.aggregate(how)

            def _agg(arg, func):
                """
                run the aggregations over the arg with func
                return a dict
                """
                result = {}
                for fname, agg_how in arg.items():
                    result[fname] = func(fname, agg_how)
                return result

            # set the final keys
            keys = list(arg.keys())
            result = {}

            if self._selection is not None:

                sl = set(self._selection_list)

                # we are a Series like object,
                # but may have multiple aggregations
                if len(sl) == 1:

                    result = _agg(
                        arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how)
                    )

                # we are selecting the same set as we are aggregating
                elif not len(sl - set(keys)):

                    result = _agg(arg, _agg_1dim)

                # we are a DataFrame, with possibly multiple aggregations
                else:

                    result = _agg(arg, _agg_2dim)

            # no selection
            else:

                try:
                    result = _agg(arg, _agg_1dim)
                except SpecificationError:

                    # we are aggregating expecting all 1d-returns
                    # but we have 2d
                    result = _agg(arg, _agg_2dim)

            # combine results

            def is_any_series() -> bool:
                # return a boolean if we have *any* nested series
                return any(isinstance(r, ABCSeries) for r in result.values())

            def is_any_frame() -> bool:
                # return a boolean if we have *any* nested frames
                return any(isinstance(r, ABCDataFrame) for r in result.values())

            if isinstance(result, list):
                return concat(result, keys=keys, axis=1, sort=True), True

            elif is_any_frame():
                # we have a dict of DataFrames
                # return a MI DataFrame

                return concat([result[k] for k in keys], keys=keys, axis=1), True

            elif isinstance(self, ABCSeries) and is_any_series():

                # we have a dict of Series
                # return a MI Series
                try:
                    result = concat(result)
                except TypeError:
                    # we want to give a nice error here if
                    # we have non-same sized objects, so
                    # we don't automatically broadcast

                    raise ValueError(
                        "cannot perform both aggregation "
                        "and transformation operations "
                        "simultaneously"
                    )

                return result, True

            # fall thru
            from pandas import DataFrame, Series

            try:
                result = DataFrame(result)
            except ValueError:
                # we have a dict of scalars
                result = Series(result, name=getattr(self, "name", None))

            return result, True

        elif is_list_like(arg):
            # we require a list, but not an 'str'
            return self._aggregate_multiple_funcs(arg, _axis=_axis), None
        else:
            result = None

        f = self._get_cython_func(arg)
        if f and not args and not kwargs:
            return getattr(self, f)(), None

        # caller can react
        return result, True
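
    # Illustrative doctest (editor's sketch, not part of pandas): a dict arg
    # aggregates per column; the second element of the returned tuple tells
    # the caller whether post-processing is still required. Through the
    # public ``agg``:
    #
    #   >>> import pandas as pd
    #   >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    #   >>> df.agg({"A": "sum", "B": "min"})
    #   A    3
    #   B    3
    #   dtype: int64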

    def _aggregate_multiple_funcs(self, arg, _axis):
        from pandas.core.reshape.concat import concat

        if _axis != 0:
            raise NotImplementedError("axis other than 0 is not supported")

        if self._selected_obj.ndim == 1:
            obj = self._selected_obj
        else:
            obj = self._obj_with_exclusions

        results = []
        keys = []

        # degenerate case
        if obj.ndim == 1:
            for a in arg:
                colg = self._gotitem(obj.name, ndim=1, subset=obj)
                try:
                    new_res = colg.aggregate(a)

                except TypeError:
                    pass
                else:
                    results.append(new_res)

                    # make sure we find a good name
                    name = com.get_callable_name(a) or a
                    keys.append(name)

        # multiples
        else:
            for index, col in enumerate(obj):
                colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index])
                try:
                    new_res = colg.aggregate(arg)
                except (TypeError, DataError):
                    pass
                except ValueError as err:
                    # cannot aggregate
                    if "Must produce aggregated value" in str(err):
                        # raised directly in _aggregate_named
                        pass
                    elif "no results" in str(err):
                        # raised directly in _aggregate_multiple_funcs
                        pass
                    else:
                        raise
                else:
                    results.append(new_res)
                    keys.append(col)

        # if we are empty
        if not len(results):
            raise ValueError("no results")

        try:
            return concat(results, keys=keys, axis=1, sort=False)
        except TypeError:

            # we are concatting non-NDFrame objects,
            # e.g. a list of scalars

            from pandas import Series

            result = Series(results, index=keys, name=self.name)
            if is_nested_object(result):
                raise ValueError("cannot combine transform and aggregation operations")
            return result
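
    # Illustrative doctest (editor's sketch, not part of pandas): a list of
    # functions produces one entry (or column) per function, keyed by the
    # function's name.
    #
    #   >>> import pandas as pd
    #   >>> pd.Series([1, 2, 3]).agg(["min", "max"])
    #   min    1
    #   max    3
    #   dtype: int64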

    def _get_cython_func(self, arg: str) -> Optional[str]:
        """
        if we define an internal function for this argument, return it
        """
        return self._cython_table.get(arg)

    def _is_builtin_func(self, arg):
        """
        if we define a builtin function for this argument, return it,
        otherwise return the arg
        """
        return self._builtin_table.get(arg, arg)
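
    # Illustrative doctest (editor's sketch, not part of pandas): builtins are
    # swapped for their NumPy equivalents before aggregation, while anything
    # not in the table passes through unchanged.
    #
    #   >>> SelectionMixin()._is_builtin_func(min) is np.min
    #   True
    #   >>> SelectionMixin()._is_builtin_func(len) is len
    #   True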


class ShallowMixin:
    _attributes: List[str] = []

    def _shallow_copy(self, obj=None, **kwargs):
        """
        return a new object with the replacement attributes
        """
        if obj is None:
            obj = self._selected_obj.copy()

        if isinstance(obj, self._constructor):
            obj = obj.obj
        for attr in self._attributes:
            if attr not in kwargs:
                kwargs[attr] = getattr(self, attr)
        return self._constructor(obj, **kwargs)
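
# Illustrative sketch (editor's note, not part of pandas): a subclass lists
# its constructor kwargs in ``_attributes``; ``_shallow_copy`` then rebuilds
# an equivalent wrapper around new data, carrying those attributes over
# unless explicitly overridden. The ``Wrapper`` class below is hypothetical.
#
#   >>> class Wrapper(ShallowMixin):
#   ...     _attributes = ["freq"]
#   ...     def __init__(self, obj, freq=None):
#   ...         self.obj, self.freq = obj, freq
#   ...     @property
#   ...     def _constructor(self):
#   ...         return type(self)
#   ...     @property
#   ...     def _selected_obj(self):
#   ...         return self.obj
#   >>> w = Wrapper([1, 2], freq="D")
#   >>> w._shallow_copy([3, 4]).freq
#   'D'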


class IndexOpsMixin:
    """
    Common ops mixin to support a unified interface / docs for Series / Index
    """

    # ndarray compatibility
    __array_priority__ = 1000
    _deprecations: FrozenSet[str] = frozenset(
        ["tolist"]  # tolist is not deprecated, just suppressed in the __dir__
    )

    def transpose(self, *args, **kwargs):
        """
        Return the transpose, which is by definition self.

        Returns
        -------
        %(klass)s
        """
        nv.validate_transpose(args, kwargs)
        return self

    T = property(
        transpose,
        doc="""
        Return the transpose, which is by definition self.
        """,
    )

    @property
    def shape(self):
        """
        Return a tuple of the shape of the underlying data.
        """
        return self._values.shape

    @property
    def ndim(self) -> int:
        """
        Number of dimensions of the underlying data, by definition 1.
        """
        return 1

    def item(self):
        """
        Return the first element of the underlying data as a python scalar.

        Returns
        -------
        scalar
            The first element of %(klass)s.

        Raises
        ------
        ValueError
            If the data is not length-1.
        """
        if not (
            is_extension_array_dtype(self.dtype) or needs_i8_conversion(self.dtype)
        ):
            # numpy returns ints instead of datetime64/timedelta64 objects,
            # which we need to wrap in Timestamp/Timedelta/Period regardless.
            return self.values.item()

        if len(self) == 1:
            return next(iter(self))
        else:
            raise ValueError("can only convert an array of size 1 to a Python scalar")
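
    # Illustrative doctest (editor's sketch, not part of pandas): ``item``
    # unboxes a length-1 container to a Python (or pandas) scalar and refuses
    # anything longer.
    #
    #   >>> import pandas as pd
    #   >>> pd.Series([10]).item()
    #   10
    #   >>> pd.Series([1, 2]).item()  # doctest: +SKIP
    #   Traceback (most recent call last):
    #       ...
    #   ValueError: can only convert an array of size 1 to a Python scalar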

    @property
    def nbytes(self):
        """
        Return the number of bytes in the underlying data.
        """
        return self._values.nbytes

    @property
    def size(self):
        """
        Return the number of elements in the underlying data.
        """
        return len(self._values)

    @property
    def array(self) -> ExtensionArray:
        """
        The ExtensionArray of the data backing this Series or Index.

        .. versionadded:: 0.24.0

        Returns
        -------
        ExtensionArray
            An ExtensionArray of the values stored within. For extension
            types, this is the actual array. For NumPy native types, this
            is a thin (no copy) wrapper around :class:`numpy.ndarray`.

            ``.array`` differs from ``.values``, which may require converting
            the data to a different form.

        See Also
        --------
        Index.to_numpy : Similar method that always returns a NumPy array.
        Series.to_numpy : Similar method that always returns a NumPy array.

        Notes
        -----
        This table lays out the different array types for each extension
        dtype within pandas.

        ================== =============================
        dtype              array type
        ================== =============================
        category           Categorical
        period             PeriodArray
        interval           IntervalArray
        IntegerNA          IntegerArray
        string             StringArray
        boolean            BooleanArray
        datetime64[ns, tz] DatetimeArray
        ================== =============================

        For any 3rd-party extension types, the array type will be an
        ExtensionArray.

        For all remaining dtypes ``.array`` will be a
        :class:`arrays.PandasArray` wrapping the actual ndarray
        stored within. If you absolutely need a NumPy array (possibly with
        copying / coercing data), then use :meth:`Series.to_numpy` instead.

        Examples
        --------
        For regular NumPy types like int, and float, a PandasArray
        is returned.

        >>> pd.Series([1, 2, 3]).array
        <PandasArray>
        [1, 2, 3]
        Length: 3, dtype: int64

        For extension types, like Categorical, the actual ExtensionArray
        is returned

        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        >>> ser.array
        [a, b, a]
        Categories (2, object): [a, b]
        """
        raise AbstractMethodError(self)

    def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs):
        """
        A NumPy ndarray representing the values in this Series or Index.

        .. versionadded:: 0.24.0

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.
        na_value : Any, optional
            The value to use for missing values. The default value depends
            on `dtype` and the type of the array.

            .. versionadded:: 1.0.0

        **kwargs
            Additional keywords passed through to the ``to_numpy`` method
            of the underlying array (for extension arrays).

            .. versionadded:: 1.0.0

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        Series.array : Get the actual data stored within.
        Index.array : Get the actual data stored within.
        DataFrame.to_numpy : Similar method for DataFrame.

        Notes
        -----
        The returned array will be the same up to equality (values equal
        in `self` will be equal in the returned array; likewise for values
        that are not equal). When `self` contains an ExtensionArray, the
        dtype may be different. For example, for a category-dtype Series,
        ``to_numpy()`` will return a NumPy array and the categorical dtype
        will be lost.

        For NumPy dtypes, this will be a reference to the actual data stored
        in this Series or Index (assuming ``copy=False``). Modifying the result
        in place will modify the data stored in the Series or Index (not that
        we recommend doing that).

        For extension types, ``to_numpy()`` *may* require copying data and
        coercing the result to a NumPy type (possibly object), which may be
        expensive. When you need a no-copy reference to the underlying data,
        :attr:`Series.array` should be used instead.

        This table lays out the different dtypes and default return types of
        ``to_numpy()`` for various dtypes within pandas.

        ================== ================================
        dtype              array type
        ================== ================================
        category[T]        ndarray[T] (same dtype as input)
        period             ndarray[object] (Periods)
        interval           ndarray[object] (Intervals)
        IntegerNA          ndarray[object]
        datetime64[ns]     datetime64[ns]
        datetime64[ns, tz] ndarray[object] (Timestamps)
        ================== ================================

        Examples
        --------
        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        >>> ser.to_numpy()
        array(['a', 'b', 'a'], dtype=object)

        Specify the `dtype` to control how datetime-aware data is represented.
        Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
        objects, each with the correct ``tz``.

        >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
        >>> ser.to_numpy(dtype=object)
        array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
               Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
              dtype=object)

        Or ``dtype='datetime64[ns]'`` to return an ndarray of native
        datetime64 values. The values are converted to UTC and the timezone
        info is dropped.

        >>> ser.to_numpy(dtype="datetime64[ns]")
        ... # doctest: +ELLIPSIS
        array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
              dtype='datetime64[ns]')
        """
        if is_extension_array_dtype(self.dtype):
            return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
        else:
            if kwargs:
                msg = "to_numpy() got an unexpected keyword argument '{}'".format(
                    list(kwargs.keys())[0]
                )
                raise TypeError(msg)

        result = np.asarray(self._values, dtype=dtype)
        # TODO(GH-24345): Avoid potential double copy
        if copy or na_value is not lib.no_default:
            result = result.copy()
            if na_value is not lib.no_default:
                result[self.isna()] = na_value
        return result

    @property
    def _ndarray_values(self) -> np.ndarray:
        """
        The data as an ndarray, possibly losing information.

        The expectation is that this is cheap to compute, and is primarily
        used for interacting with our indexers.

        - categorical -> codes
        """
        if is_extension_array_dtype(self):
            return self.array._ndarray_values
        # As a mixin, we depend on the mixing class having values.
        # Special mixin syntax may be developed in the future:
        # https://github.com/python/typing/issues/246
        return self.values  # type: ignore

    @property
    def empty(self):
        return not self.size

    def max(self, axis=None, skipna=True, *args, **kwargs):
        """
        Return the maximum value of the Index.

        Parameters
        ----------
        axis : int, optional
            For compatibility with NumPy. Only 0 or None are allowed.
        skipna : bool, default True

        Returns
        -------
        scalar
            Maximum value.

        See Also
        --------
        Index.min : Return the minimum value in an Index.
        Series.max : Return the maximum value in a Series.
        DataFrame.max : Return the maximum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.max()
        3

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.max()
        'c'

        For a MultiIndex, the maximum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.max()
        ('b', 2)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_max(args, kwargs)
        return nanops.nanmax(self._values, skipna=skipna)

    def argmax(self, axis=None, skipna=True, *args, **kwargs):
        """
        Return an ndarray of the maximum argument indexer.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True

        Returns
        -------
        numpy.ndarray
            Indices of the maximum values.

        See Also
        --------
        numpy.ndarray.argmax
        """
        nv.validate_minmax_axis(axis)
        nv.validate_argmax_with_skipna(skipna, args, kwargs)
        return nanops.nanargmax(self._values, skipna=skipna)

    def min(self, axis=None, skipna=True, *args, **kwargs):
        """
        Return the minimum value of the Index.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True

        Returns
        -------
        scalar
            Minimum value.

        See Also
        --------
        Index.max : Return the maximum value of the object.
        Series.min : Return the minimum value in a Series.
        DataFrame.min : Return the minimum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.min()
        1

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.min()
        'a'

        For a MultiIndex, the minimum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.min()
        ('a', 1)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_min(args, kwargs)
        return nanops.nanmin(self._values, skipna=skipna)

    def argmin(self, axis=None, skipna=True, *args, **kwargs):
        """
        Return an ndarray of the minimum argument indexer.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        numpy.ndarray.argmin
        """
        nv.validate_minmax_axis(axis)
        nv.validate_argmin_with_skipna(skipna, args, kwargs)
        return nanops.nanargmin(self._values, skipna=skipna)

    def tolist(self):
        """
        Return a list of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period)

        Returns
        -------
        list

        See Also
        --------
        numpy.ndarray.tolist
        """
        if self.dtype.kind in ["m", "M"]:
            return [com.maybe_box_datetimelike(x) for x in self._values]
        elif is_extension_array_dtype(self._values):
            return list(self._values)
        else:
            return self._values.tolist()

    to_list = tolist

    def __iter__(self):
        """
        Return an iterator of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period)

        Returns
        -------
        iterator
        """
        # We are explicitly making element iterators.
        if self.dtype.kind in ["m", "M"]:
            return map(com.maybe_box_datetimelike, self._values)
        elif is_extension_array_dtype(self._values):
            return iter(self._values)
        else:
            return map(self._values.item, range(self._values.size))
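
    # Illustrative doctest (editor's sketch, not part of pandas): iteration
    # boxes datetime-like values into pandas scalars rather than yielding raw
    # numpy.datetime64 values.
    #
    #   >>> import pandas as pd
    #   >>> ts = next(iter(pd.Series(pd.date_range("2000", periods=1))))
    #   >>> type(ts).__name__
    #   'Timestamp'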

    @cache_readonly
    def hasnans(self):
        """
        Return if I have any nans; enables various perf speedups.
        """
        return bool(isna(self).any())

    def _reduce(
        self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
    ):
        """ perform the reduction type operation if we can """
        func = getattr(self, name, None)
        if func is None:
            raise TypeError(
                f"{type(self).__name__} cannot perform the operation {name}"
            )
        return func(skipna=skipna, **kwds)

    def _map_values(self, mapper, na_action=None):
        """
        An internal function that maps values using the input
        correspondence (which can be a dict, Series, or function).

        Parameters
        ----------
        mapper : function, dict, or Series
            The input correspondence object
        na_action : {None, 'ignore'}
            If 'ignore', propagate NA values, without passing them to the
            mapping function

        Returns
        -------
        Union[Index, MultiIndex], inferred
            The output of the mapping function applied to the index.
            If the function returns a tuple with more than one element
            a MultiIndex will be returned.
        """

        # we can fastpath dict/Series to an efficient map
        # as we know that we are not going to have to yield
        # python types
        if is_dict_like(mapper):
            if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
                # If a dictionary subclass defines a default value method,
                # convert mapper to a lookup function (GH #15999).
                dict_with_default = mapper
                mapper = lambda x: dict_with_default[x]
            else:
                # Dictionary does not have a default. Thus it's safe to
                # convert to a Series for efficiency.
                # we specify the keys here to handle the
                # possibility that they are tuples

                # The return value of mapping with an empty mapper is
                # expected to be pd.Series(np.nan, ...). As np.nan is
                # of dtype float64 the return value of this method should
                # be float64 as well
                mapper = create_series_with_explicit_dtype(
                    mapper, dtype_if_empty=np.float64
                )

        if isinstance(mapper, ABCSeries):
            # Since values were input this means we came from either
            # a dict or a series and mapper should be an index
            if is_categorical_dtype(self._values):
                # use the built in categorical series mapper which saves
                # time by mapping the categories instead of all values
                return self._values.map(mapper)
            if is_extension_array_dtype(self.dtype):
                values = self._values
            else:
                values = self.values

            indexer = mapper.index.get_indexer(values)
            new_values = algorithms.take_1d(mapper._values, indexer)

            return new_values

        # we must convert to python types
        if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
            # GH#23179 some EAs do not have `map`
            values = self._values
            if na_action is not None:
                raise NotImplementedError
            map_f = lambda values, f: values.map(f)
        else:
            values = self.astype(object)
            values = getattr(values, "values", values)
            if na_action == "ignore":

                def map_f(values, f):
                    return lib.map_infer_mask(values, f, isna(values).view(np.uint8))

            else:
                map_f = lib.map_infer

        # mapper is a function
        new_values = map_f(values, mapper)

        return new_values
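
    # Illustrative doctest (editor's sketch, not part of pandas): dict and
    # Series mappers take the indexer fast path above, with missing keys
    # mapping to NaN; callables fall through to element-wise inference.
    # Through the public ``Series.map``:
    #
    #   >>> import pandas as pd
    #   >>> pd.Series(["cat", "dog"]).map({"cat": "kitten"})
    #   0    kitten
    #   1       NaN
    #   dtype: object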

    def value_counts(
        self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
    ):
        """
        Return a Series containing counts of unique values.

        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : bool, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        bins : int, optional
            Rather than count values, group them into half-open bins,
            a convenience for ``pd.cut``, only works with numeric data.
        dropna : bool, default True
            Don't include counts of NaN.

        Returns
        -------
        Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.count: Number of non-NA elements in a DataFrame.

        Examples
        --------
        >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
        >>> index.value_counts()
        3.0    2
        4.0    1
        2.0    1
        1.0    1
        dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
        >>> s.value_counts(normalize=True)
        3.0    0.4
        4.0    0.2
        2.0    0.2
        1.0    0.2
        dtype: float64

        **bins**

        Bins can be useful for going from a continuous variable to a
        categorical variable; instead of counting unique
        apparitions of values, divide the index in the specified
        number of half-open bins.

        >>> s.value_counts(bins=3)
        (2.0, 3.0]      2
        (0.996, 2.0]    2
        (3.0, 4.0]      1
        dtype: int64

        **dropna**

        With `dropna` set to `False` we can also see NaN index values.

        >>> s.value_counts(dropna=False)
        3.0    2
        NaN    1
        4.0    1
        2.0    1
        1.0    1
        dtype: int64
        """
        result = value_counts(
            self,
            sort=sort,
            ascending=ascending,
            normalize=normalize,
            bins=bins,
            dropna=dropna,
        )
        return result

    def unique(self):
        values = self._values

        if hasattr(values, "unique"):

            result = values.unique()
        else:
            result = unique1d(values)

        return result

    def nunique(self, dropna=True):
        """
        Return number of unique elements in the object.

        Excludes NA values by default.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the count.

        Returns
        -------
        int

        See Also
        --------
        DataFrame.nunique: Method nunique for DataFrame.
        Series.count: Count non-NA/null observations in the Series.

        Examples
        --------
        >>> s = pd.Series([1, 3, 5, 7, 7])
        >>> s
        0    1
        1    3
        2    5
        3    7
        4    7
        dtype: int64

        >>> s.nunique()
        4
        """
        uniqs = self.unique()
        n = len(uniqs)
        if dropna and isna(uniqs).any():
            n -= 1
        return n

    @property
    def is_unique(self):
        """
        Return boolean if values in the object are unique.

        Returns
        -------
        bool
        """
        return self.nunique(dropna=False) == len(self)

    @property
    def is_monotonic(self):
        """
        Return boolean if values in the object are
        monotonic_increasing.

        Returns
        -------
        bool
        """
        from pandas import Index

        return Index(self).is_monotonic

    is_monotonic_increasing = is_monotonic

    @property
    def is_monotonic_decreasing(self) -> bool:
        """
        Return boolean if values in the object are
        monotonic_decreasing.

        Returns
        -------
        bool
        """
        from pandas import Index

        return Index(self).is_monotonic_decreasing

    def memory_usage(self, deep=False):
        """
        Memory usage of the values.

        Parameters
        ----------
        deep : bool
            Introspect the data deeply, interrogate
            `object` dtypes for system-level memory consumption.

        Returns
        -------
        bytes used

        See Also
        --------
        numpy.ndarray.nbytes

        Notes
        -----
        Memory usage does not include memory consumed by elements that
        are not components of the array if deep=False or if used on PyPy
        """
        if hasattr(self.array, "memory_usage"):
            return self.array.memory_usage(deep=deep)

        v = self.array.nbytes
        if deep and is_object_dtype(self) and not PYPY:
            v += lib.memory_usage_of_objects(self._values)
        return v
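
    # Illustrative doctest (editor's sketch, not part of pandas): for object
    # dtype, ``deep=True`` adds the per-element Python object sizes on top of
    # the flat pointer array, so it reports strictly more bytes.
    #
    #   >>> import pandas as pd
    #   >>> idx = pd.Index(["a" * 50, "b" * 50])
    #   >>> idx.memory_usage(deep=True) > idx.memory_usage()  # doctest: +SKIP
    #   True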

    @Substitution(
        values="",
        order="",
        size_hint="",
        sort=textwrap.dedent(
            """\
            sort : bool, default False
                Sort `uniques` and shuffle `codes` to maintain the
                relationship.
            """
        ),
    )
    @Appender(algorithms._shared_docs["factorize"])
    def factorize(self, sort=False, na_sentinel=-1):
        return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)

    _shared_docs[
        "searchsorted"
    ] = """
        Find indices where elements should be inserted to maintain order.

        Find the indices into a sorted %(klass)s `self` such that, if the
        corresponding elements in `value` were inserted before the indices,
        the order of `self` would be preserved.

        .. note::

            The %(klass)s *must* be monotonically sorted, otherwise
            wrong locations will likely be returned. Pandas does *not*
            check this for you.

        Parameters
        ----------
        value : array_like
            Values to insert into `self`.
        side : {'left', 'right'}, optional
            If 'left', the index of the first suitable location found is given.
            If 'right', return the last such index. If there is no suitable
            index, return either 0 or N (where N is the length of `self`).
        sorter : 1-D array_like, optional
            Optional array of integer indices that sort `self` into ascending
            order. They are typically the result of ``np.argsort``.

        Returns
        -------
        int or array of int
            A scalar or array of insertion points with the
            same shape as `value`.

            .. versionchanged:: 0.24.0
                If `value` is a scalar, an int is now always returned.
                Previously, scalar inputs returned an 1-item array for
                :class:`Series` and :class:`Categorical`.

        See Also
        --------
        sort_values
        numpy.searchsorted

        Notes
        -----
        Binary search is used to find the required insertion points.

        Examples
        --------

        >>> x = pd.Series([1, 2, 3])
        >>> x
        0    1
        1    2
        2    3
        dtype: int64

        >>> x.searchsorted(4)
        3

        >>> x.searchsorted([0, 4])
        array([0, 3])

        >>> x.searchsorted([1, 3], side='left')
        array([0, 2])

        >>> x.searchsorted([1, 3], side='right')
        array([1, 3])

        >>> x = pd.Categorical(['apple', 'bread', 'bread',
        ...                     'cheese', 'milk'], ordered=True)
        >>> x
        [apple, bread, bread, cheese, milk]
        Categories (4, object): [apple < bread < cheese < milk]

        >>> x.searchsorted('bread')
        1

        >>> x.searchsorted(['bread'], side='right')
        array([3])

        If the values are not monotonically sorted, wrong locations
        may be returned:

        >>> x = pd.Series([2, 1, 3])
        >>> x.searchsorted(1)
        0  # wrong result, correct would be 1
        """

    @Substitution(klass="Index")
    @Appender(_shared_docs["searchsorted"])
    def searchsorted(self, value, side="left", sorter=None):
        return algorithms.searchsorted(self._values, value, side=side, sorter=sorter)

    def drop_duplicates(self, keep="first", inplace=False):
        inplace = validate_bool_kwarg(inplace, "inplace")
        if isinstance(self, ABCIndexClass):
            if self.is_unique:
                return self._shallow_copy()

        duplicated = self.duplicated(keep=keep)
        result = self[np.logical_not(duplicated)]
        if inplace:
            return self._update_inplace(result)
        else:
            return result

    def duplicated(self, keep="first"):
        if isinstance(self, ABCIndexClass):
            if self.is_unique:
                return np.zeros(len(self), dtype=np.bool)
            return duplicated(self, keep=keep)
        else:
            return self._constructor(
                duplicated(self, keep=keep), index=self.index
            ).__finalize__(self)
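
    # Illustrative doctest (editor's sketch, not part of pandas): ``keep``
    # controls which occurrence counts as the original; every other occurrence
    # is flagged and dropped by ``drop_duplicates``.
    #
    #   >>> import pandas as pd
    #   >>> pd.Index([1, 2, 1]).duplicated(keep="last")
    #   array([ True, False, False])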

    # ----------------------------------------------------------------------
    # abstracts

    def _update_inplace(self, result, verify_is_copy=True, **kwargs):
        raise AbstractMethodError(self)