Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/generic.py : 20%

import collections
from datetime import timedelta
import functools
import gc
import json
import operator
import pickle
import re
from textwrap import dedent
from typing import (
    Any,
    Callable,
    Dict,
    FrozenSet,
    Hashable,
    List,
    Mapping,
    Optional,
    Sequence,
    Set,
    Tuple,
    Type,
    Union,
)
import warnings
import weakref

import numpy as np

from pandas._config import config

from pandas._libs import Timestamp, iNaT, lib, properties
from pandas._typing import (
    Axis,
    Dtype,
    FilePathOrBuffer,
    FrameOrSeries,
    JSONSerializable,
    Level,
    Renamer,
)
from pandas.compat import set_function_name
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution, rewrite_axis_style_signature
from pandas.util._validators import (
    validate_bool_kwarg,
    validate_fillna_kwargs,
    validate_percentile,
)

from pandas.core.dtypes.common import (
    ensure_int64,
    ensure_object,
    ensure_str,
    is_bool,
    is_bool_dtype,
    is_datetime64_any_dtype,
    is_datetime64tz_dtype,
    is_dict_like,
    is_extension_array_dtype,
    is_float,
    is_integer,
    is_list_like,
    is_number,
    is_numeric_dtype,
    is_object_dtype,
    is_period_arraylike,
    is_re_compilable,
    is_scalar,
    is_timedelta64_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import isna, notna

import pandas as pd
from pandas.core import missing, nanops
import pandas.core.algorithms as algos
from pandas.core.base import PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.indexes.api import (
    Index,
    InvalidIndexError,
    MultiIndex,
    RangeIndex,
    ensure_index,
)
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.period import Period, PeriodIndex
import pandas.core.indexing as indexing
from pandas.core.internals import BlockManager
from pandas.core.missing import find_valid_index
from pandas.core.ops import _align_method_FRAME

from pandas.io.formats import format as fmt
from pandas.io.formats.format import DataFrameFormatter, format_percentiles
from pandas.io.formats.printing import pprint_thing
from pandas.tseries.frequencies import to_offset

# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs: Dict[str, str] = dict()
_shared_doc_kwargs = dict(
    axes="keywords for axes",
    klass="Series/DataFrame",
    axes_single_arg="int or labels for object",
    args_transpose="axes to permute (int or label for object)",
    optional_by="""
        by : str or list of str
            Name or list of names to sort by""",
)

def _single_replace(self, to_replace, method, inplace, limit):
    """
    Replace values in a Series using the given fill method when no
    replacement value is passed to the replace method.
    """
    if self.ndim != 1:
        raise TypeError(
            f"cannot replace {to_replace} with method {method} on a "
            f"{type(self).__name__}"
        )

    orig_dtype = self.dtype
    result = self if inplace else self.copy()
    fill_f = missing.get_fill_func(method)

    mask = missing.mask_missing(result.values, to_replace)
    values = fill_f(result.values, limit=limit, mask=mask)

    if values.dtype == orig_dtype and inplace:
        return

    result = pd.Series(values, index=self.index, dtype=self.dtype).__finalize__(self)

    if inplace:
        self._update_inplace(result._data)
        return

    return result
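
# Illustrative sketch (not part of the original source): _single_replace backs
# Series.replace when a fill ``method`` is given instead of a replacement
# value. Assuming a plain integer Series:
#
#     >>> s = pd.Series([0, 1, 2, 3, 4])
#     >>> s.replace(2, method="ffill")
#     0    0
#     1    1
#     2    1
#     3    3
#     4    4
#     dtype: int64
#
# The matched position (value 2) is masked and then forward-filled from the
# preceding element.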

bool_t = bool  # Need alias because NDFrame has def bool:

class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin):
    """
    N-dimensional analogue of DataFrame. Store multi-dimensional data in a
    size-mutable, labeled data structure.

    Parameters
    ----------
    data : BlockManager
    axes : list
    copy : bool, default False
    """

    _internal_names: List[str] = [
        "_data",
        "_cacher",
        "_item_cache",
        "_cache",
        "_is_copy",
        "_subtyp",
        "_name",
        "_index",
        "_default_kind",
        "_default_fill_value",
        "_metadata",
        "__array_struct__",
        "__array_interface__",
    ]
    _internal_names_set: Set[str] = set(_internal_names)
    _accessors: Set[str] = set()
    _deprecations: FrozenSet[str] = frozenset(["get_values", "ix"])
    _metadata: List[str] = []
    _is_copy = None
    _data: BlockManager
    _attrs: Dict[Optional[Hashable], Any]
    _typ: str

    # ----------------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        data: BlockManager,
        axes: Optional[List[Index]] = None,
        copy: bool = False,
        dtype: Optional[Dtype] = None,
        attrs: Optional[Mapping[Optional[Hashable], Any]] = None,
        fastpath: bool = False,
    ):

        if not fastpath:
            if dtype is not None:
                data = data.astype(dtype)
            elif copy:
                data = data.copy()

            if axes is not None:
                for i, ax in enumerate(axes):
                    data = data.reindex_axis(ax, axis=i)

        object.__setattr__(self, "_is_copy", None)
        object.__setattr__(self, "_data", data)
        object.__setattr__(self, "_item_cache", {})
        if attrs is None:
            attrs = {}
        else:
            attrs = dict(attrs)
        object.__setattr__(self, "_attrs", attrs)

    def _init_mgr(self, mgr, axes=None, dtype=None, copy=False):
        """ passed a manager and an axes dict """
        for a, axe in axes.items():
            if axe is not None:
                mgr = mgr.reindex_axis(
                    axe, axis=self._get_block_manager_axis(a), copy=False
                )

        # make a copy if explicitly requested
        if copy:
            mgr = mgr.copy()
        if dtype is not None:
            # avoid further copies if we can
            if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype:
                mgr = mgr.astype(dtype=dtype)
        return mgr

    # ----------------------------------------------------------------------

    @property
    def attrs(self) -> Dict[Optional[Hashable], Any]:
        """
        Dictionary of global attributes on this object.

        .. warning::

           attrs is experimental and may change without warning.
        """
        if self._attrs is None:
            self._attrs = {}
        return self._attrs

    @attrs.setter
    def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None:
        self._attrs = dict(value)

    def _validate_dtype(self, dtype):
        """ validate the passed dtype """

        if dtype is not None:
            dtype = pandas_dtype(dtype)

            # a compound dtype
            if dtype.kind == "V":
                raise NotImplementedError(
                    "compound dtypes are not implemented"
                    f" in the {type(self).__name__} constructor"
                )

        return dtype

    # ----------------------------------------------------------------------
    # Construction

    @property
    def _constructor(self: FrameOrSeries) -> Type[FrameOrSeries]:
        """Used when a manipulation result has the same dimensions as the
        original.
        """
        raise AbstractMethodError(self)

    @property
    def _constructor_sliced(self):
        """Used when a manipulation result has one lower dimension than the
        original, such as slicing a single column of a DataFrame.
        """
        raise AbstractMethodError(self)

    @property
    def _constructor_expanddim(self):
        """Used when a manipulation result has one higher dimension than the
        original, such as Series.to_frame().
        """
        raise NotImplementedError

    # ----------------------------------------------------------------------
    # Axis
    _AXIS_ALIASES = {"rows": 0}
    _AXIS_IALIASES = {0: "rows"}
    _stat_axis_number = 0
    _stat_axis_name = "index"
    _ix = None
    _AXIS_ORDERS: List[str]
    _AXIS_NUMBERS: Dict[str, int]
    _AXIS_NAMES: Dict[int, str]
    _AXIS_REVERSED: bool
    _info_axis_number: int
    _info_axis_name: str
    _AXIS_LEN: int

    @classmethod
    def _setup_axes(cls, axes: List[str], docs: Dict[str, str]) -> None:
        """
        Provide axes setup for the major PandasObjects.

        Parameters
        ----------
        axes : the names of the axes in order (lowest to highest)
        docs : docstrings for the axis properties
        """
        info_axis = len(axes) - 1
        axes_are_reversed = len(axes) > 1

        cls._AXIS_ORDERS = axes
        cls._AXIS_NUMBERS = {a: i for i, a in enumerate(axes)}
        cls._AXIS_LEN = len(axes)
        cls._AXIS_NAMES = dict(enumerate(axes))
        cls._AXIS_REVERSED = axes_are_reversed

        cls._info_axis_number = info_axis
        cls._info_axis_name = axes[info_axis]

        # setup the actual axis
        def set_axis(a, i):
            setattr(cls, a, properties.AxisProperty(i, docs.get(a, a)))
            cls._internal_names_set.add(a)

        if axes_are_reversed:
            for i, a in cls._AXIS_NAMES.items():
                set_axis(a, 1 - i)
        else:
            for i, a in cls._AXIS_NAMES.items():
                set_axis(a, i)
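
    # Illustrative note (not in the original source): subclasses call this
    # once at class-creation time, e.g. Series with a single "index" axis and
    # DataFrame with ["index", "columns"]. Having more than one axis also
    # marks DataFrame's axes as "reversed" relative to its BlockManager; see
    # _get_block_manager_axis below.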

    def _construct_axes_dict(self, axes=None, **kwargs):
        """Return an axes dictionary for myself."""
        d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
        d.update(kwargs)
        return d

    @staticmethod
    def _construct_axes_dict_from(self, axes, **kwargs):
        """Return an axes dictionary for the passed axes."""
        d = {a: ax for a, ax in zip(self._AXIS_ORDERS, axes)}
        d.update(kwargs)
        return d

    def _construct_axes_from_arguments(
        self, args, kwargs, require_all: bool = False, sentinel=None
    ):
        """
        Construct and return axes if supplied in args/kwargs.

        If require_all, raise if not all axis arguments are supplied.
        Return a tuple of (axes, kwargs).

        sentinel specifies the default parameter when an axis is not
        supplied; useful to distinguish when a user explicitly passes None
        in scenarios where None has special meaning.
        """
        # construct the args
        args = list(args)
        for a in self._AXIS_ORDERS:

            # look for an argument by position
            if a not in kwargs:
                try:
                    kwargs[a] = args.pop(0)
                except IndexError:
                    if require_all:
                        raise TypeError("not enough/duplicate arguments specified!")

        axes = {a: kwargs.pop(a, sentinel) for a in self._AXIS_ORDERS}
        return axes, kwargs
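
    # Illustrative note (not in the original source): for a DataFrame,
    # self._construct_axes_from_arguments((new_index, new_cols), {}) returns
    # ({"index": new_index, "columns": new_cols}, {}), which lets callers
    # accept axes either positionally or as keywords.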

    @classmethod
    def _from_axes(cls: Type[FrameOrSeries], data, axes, **kwargs) -> FrameOrSeries:
        # for construction from BlockManager
        if isinstance(data, BlockManager):
            return cls(data, **kwargs)
        else:
            if cls._AXIS_REVERSED:
                axes = axes[::-1]
            d = cls._construct_axes_dict_from(cls, axes, copy=False)
            d.update(kwargs)
            return cls(data, **d)

    @classmethod
    def _get_axis_number(cls, axis):
        axis = cls._AXIS_ALIASES.get(axis, axis)
        if is_integer(axis):
            if axis in cls._AXIS_NAMES:
                return axis
        else:
            try:
                return cls._AXIS_NUMBERS[axis]
            except KeyError:
                pass
        raise ValueError(f"No axis named {axis} for object type {cls}")

    @classmethod
    def _get_axis_name(cls, axis):
        axis = cls._AXIS_ALIASES.get(axis, axis)
        if isinstance(axis, str):
            if axis in cls._AXIS_NUMBERS:
                return axis
        else:
            try:
                return cls._AXIS_NAMES[axis]
            except KeyError:
                pass
        raise ValueError(f"No axis named {axis} for object type {cls}")

    def _get_axis(self, axis):
        name = self._get_axis_name(axis)
        return getattr(self, name)

    @classmethod
    def _get_block_manager_axis(cls, axis):
        """Map the axis to the block_manager axis."""
        axis = cls._get_axis_number(axis)
        if cls._AXIS_REVERSED:
            m = cls._AXIS_LEN - 1
            return m - axis
        return axis
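
    # Illustrative note (not in the original source): a DataFrame stores its
    # blocks with columns as BlockManager axis 0 and the index as axis 1, so
    # DataFrame._get_block_manager_axis(0) == 1 and vice versa; for a Series
    # the mapping is the identity.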

    def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]:
        # index or columns
        axis_index = getattr(self, axis)
        d = dict()
        prefix = axis[0]

        for i, name in enumerate(axis_index.names):
            if name is not None:
                key = level = name
            else:
                # prefix with 'i' or 'c' depending on the input axis
                # e.g., you must do ilevel_0 for the 0th level of an unnamed
                # multiindex
                key = f"{prefix}level_{i}"
                level = i

            level_values = axis_index.get_level_values(level)
            s = level_values.to_series()
            s.index = axis_index
            d[key] = s

        # put the index/columns itself in the dict
        if isinstance(axis_index, MultiIndex):
            dindex = axis_index
        else:
            dindex = axis_index.to_series()

        d[axis] = dindex
        return d

    def _get_index_resolvers(self) -> Dict[str, ABCSeries]:
        from pandas.core.computation.parsing import clean_column_name

        d: Dict[str, ABCSeries] = {}
        for axis_name in self._AXIS_ORDERS:
            d.update(self._get_axis_resolvers(axis_name))

        return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}

    def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]:
        """
        Return the special-character-free column resolvers of a dataframe.

        Column names with special characters are 'cleaned up' so that they can
        be referred to by backtick quoting.
        Used in :meth:`DataFrame.eval`.
        """
        from pandas.core.computation.parsing import clean_column_name

        if isinstance(self, ABCSeries):
            return {clean_column_name(self.name): self}

        return {
            clean_column_name(k): v for k, v in self.items() if not isinstance(k, int)
        }

    @property
    def _info_axis(self):
        return getattr(self, self._info_axis_name)

    @property
    def _stat_axis(self):
        return getattr(self, self._stat_axis_name)

    @property
    def shape(self) -> Tuple[int, ...]:
        """
        Return a tuple of axis dimensions.
        """
        return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)

    @property
    def axes(self) -> List[Index]:
        """
        Return index label(s) of the internal NDFrame.
        """
        # we do it this way because if we have reversed axes, then
        # the block manager shows them reversed
        return [self._get_axis(a) for a in self._AXIS_ORDERS]

    @property
    def ndim(self) -> int:
        """
        Return an int representing the number of axes / array dimensions.

        Return 1 if Series. Otherwise return 2 if DataFrame.

        See Also
        --------
        ndarray.ndim : Number of array dimensions.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.ndim
        1

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.ndim
        2
        """
        return self._data.ndim

    @property
    def size(self):
        """
        Return an int representing the number of elements in this object.

        Return the number of rows if Series. Otherwise return the number of
        rows times number of columns if DataFrame.

        See Also
        --------
        ndarray.size : Number of elements in the array.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.size
        3

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.size
        4
        """
        return np.prod(self.shape)

    @property
    def _selected_obj(self: FrameOrSeries) -> FrameOrSeries:
        """ internal compat with SelectionMixin """
        return self

    @property
    def _obj_with_exclusions(self: FrameOrSeries) -> FrameOrSeries:
        """ internal compat with SelectionMixin """
        return self

    def set_axis(self, labels, axis=0, inplace=False):
        """
        Assign desired index to given axis.

        Indexes for column or row labels can be changed by assigning
        a list-like or Index.

        .. versionchanged:: 0.21.0

           The signature is now `labels` and `axis`, consistent with
           the rest of pandas API. Previously, the `axis` and `labels`
           arguments were respectively the first and second positional
           arguments.

        Parameters
        ----------
        labels : list-like, Index
            The values for the new index.

        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to update. The value 0 identifies the rows, and 1
            identifies the columns.

        inplace : bool, default False
            Whether to modify the object in place instead of returning
            a new %(klass)s instance.

        Returns
        -------
        renamed : %(klass)s or None
            An object of same type as caller if inplace=False, None otherwise.

        See Also
        --------
        DataFrame.rename_axis : Alter the name of the index or columns.

        Examples
        --------
        **Series**

        >>> s = pd.Series([1, 2, 3])
        >>> s
        0    1
        1    2
        2    3
        dtype: int64

        >>> s.set_axis(['a', 'b', 'c'], axis=0)
        a    1
        b    2
        c    3
        dtype: int64

        **DataFrame**

        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        Change the row labels.

        >>> df.set_axis(['a', 'b', 'c'], axis='index')
           A  B
        a  1  4
        b  2  5
        c  3  6

        Change the column labels.

        >>> df.set_axis(['I', 'II'], axis='columns')
           I  II
        0  1   4
        1  2   5
        2  3   6

        Now, update the labels inplace.

        >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True)
        >>> df
           i  ii
        0  1   4
        1  2   5
        2  3   6
        """
        if inplace:
            setattr(self, self._get_axis_name(axis), labels)
        else:
            obj = self.copy()
            obj.set_axis(labels, axis=axis, inplace=True)
            return obj

    def _set_axis(self, axis, labels) -> None:
        self._data.set_axis(axis, labels)
        self._clear_item_cache()

    def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries:
        """
        Interchange axes and swap values along them accordingly.

        Returns
        -------
        y : same as input
        """
        i = self._get_axis_number(axis1)
        j = self._get_axis_number(axis2)

        if i == j:
            if copy:
                return self.copy()
            return self

        mapping = {i: j, j: i}

        new_axes = (self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN))
        new_values = self.values.swapaxes(i, j)
        if copy:
            new_values = new_values.copy()

        return self._constructor(new_values, *new_axes).__finalize__(self)

    def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries:
        """
        Return DataFrame with requested index / column level(s) removed.

        .. versionadded:: 0.24.0

        Parameters
        ----------
        level : int, str, or list-like
            If a string is given, must be the name of a level.
            If list-like, elements must be names or positional indexes
            of levels.

        axis : {0 or 'index', 1 or 'columns'}, default 0

        Returns
        -------
        DataFrame
            DataFrame with requested index / column level(s) removed.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     [1, 2, 3, 4],
        ...     [5, 6, 7, 8],
        ...     [9, 10, 11, 12]
        ... ]).set_index([0, 1]).rename_axis(['a', 'b'])

        >>> df.columns = pd.MultiIndex.from_tuples([
        ...     ('c', 'e'), ('d', 'f')
        ... ], names=['level_1', 'level_2'])

        >>> df
        level_1   c   d
        level_2   e   f
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12

        >>> df.droplevel('a')
        level_1   c   d
        level_2   e   f
        b
        2        3   4
        6        7   8
        10      11  12

        >>> df.droplevel('level_2', axis=1)
        level_1   c   d
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12
        """
        labels = self._get_axis(axis)
        new_labels = labels.droplevel(level)
        result = self.set_axis(new_labels, axis=axis, inplace=False)
        return result

    def pop(self: FrameOrSeries, item) -> FrameOrSeries:
        """
        Return item and drop from frame. Raise KeyError if not found.

        Parameters
        ----------
        item : str
            Label of column to be popped.

        Returns
        -------
        Series

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan)],
        ...                   columns=('name', 'class', 'max_speed'))
        >>> df
             name   class  max_speed
        0  falcon    bird      389.0
        1  parrot    bird       24.0
        2    lion  mammal       80.5
        3  monkey  mammal        NaN

        >>> df.pop('class')
        0      bird
        1      bird
        2    mammal
        3    mammal
        Name: class, dtype: object

        >>> df
             name  max_speed
        0  falcon      389.0
        1  parrot       24.0
        2    lion       80.5
        3  monkey        NaN
        """
        result = self[item]
        del self[item]
        try:
            result._reset_cacher()
        except AttributeError:
            pass

        return result

    def squeeze(self, axis=None):
        """
        Squeeze 1 dimensional axis objects into scalars.

        Series or DataFrames with a single element are squeezed to a scalar.
        DataFrames with a single column or a single row are squeezed to a
        Series. Otherwise the object is unchanged.

        This method is most useful when you don't know if your
        object is a Series or DataFrame, but you do know it has just a single
        column. In that case you can safely call `squeeze` to ensure you have a
        Series.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default None
            A specific axis to squeeze. By default, all length-1 axes are
            squeezed.

        Returns
        -------
        DataFrame, Series, or scalar
            The projection after squeezing `axis` or all the axes.

        See Also
        --------
        Series.iloc : Integer-location based indexing for selecting scalars.
        DataFrame.iloc : Integer-location based indexing for selecting Series.
        Series.to_frame : Inverse of DataFrame.squeeze for a
            single-column DataFrame.

        Examples
        --------
        >>> primes = pd.Series([2, 3, 5, 7])

        Slicing might produce a Series with a single value:

        >>> even_primes = primes[primes % 2 == 0]
        >>> even_primes
        0    2
        dtype: int64

        >>> even_primes.squeeze()
        2

        Squeezing objects with more than one value in every axis does nothing:

        >>> odd_primes = primes[primes % 2 == 1]
        >>> odd_primes
        1    3
        2    5
        3    7
        dtype: int64

        >>> odd_primes.squeeze()
        1    3
        2    5
        3    7
        dtype: int64

        Squeezing is even more effective when used with DataFrames.

        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
        >>> df
           a  b
        0  1  2
        1  3  4

        Slicing a single column will produce a DataFrame with the columns
        having only one value:

        >>> df_a = df[['a']]
        >>> df_a
           a
        0  1
        1  3

        So the columns can be squeezed down, resulting in a Series:

        >>> df_a.squeeze('columns')
        0    1
        1    3
        Name: a, dtype: int64

        Slicing a single row from a single column will produce a single
        scalar DataFrame:

        >>> df_0a = df.loc[df.index < 1, ['a']]
        >>> df_0a
           a
        0  1

        Squeezing the rows produces a single scalar Series:

        >>> df_0a.squeeze('rows')
        a    1
        Name: 0, dtype: int64

        Squeezing all axes will project directly into a scalar:

        >>> df_0a.squeeze()
        1
        """
        axis = self._AXIS_NAMES if axis is None else (self._get_axis_number(axis),)
        return self.iloc[
            tuple(
                0 if i in axis and len(a) == 1 else slice(None)
                for i, a in enumerate(self.axes)
            )
        ]

    def swaplevel(self: FrameOrSeries, i=-2, j=-1, axis=0) -> FrameOrSeries:
        """
        Swap levels i and j in a MultiIndex on a particular axis.

        Parameters
        ----------
        i, j : int, str (can be mixed)
            Level of index to be swapped. Can pass level name as string.

        Returns
        -------
        swapped : same type as caller (new object)
        """
        axis = self._get_axis_number(axis)
        result = self.copy()
        labels = result._data.axes[axis]
        result._data.set_axis(axis, labels.swaplevel(i, j))
        return result

    # ----------------------------------------------------------------------
    # Rename

    def rename(
        self: FrameOrSeries,
        mapper: Optional[Renamer] = None,
        *,
        index: Optional[Renamer] = None,
        columns: Optional[Renamer] = None,
        axis: Optional[Axis] = None,
        copy: bool = True,
        inplace: bool = False,
        level: Optional[Level] = None,
        errors: str = "ignore",
    ) -> Optional[FrameOrSeries]:
        """
        Alter axes labels using an input function or functions. Function /
        dict values must be unique (1-to-1). Labels not contained in a dict /
        Series will be left as-is. Extra labels listed don't throw an error.
        Alternatively, change ``Series.name`` with a scalar value (Series
        only).

        Parameters
        ----------
        %(axes)s : scalar, list-like, dict-like or function, optional
            Scalar or list-like will alter the ``Series.name`` attribute,
            and raise on DataFrame.
            dict-like or functions are transformations to apply to
            that axis' values.
        copy : bool, default True
            Also copy underlying data.
        inplace : bool, default False
            Whether to return a new %(klass)s. If True then value of copy is
            ignored.
        level : int or level name, default None
            In case of a MultiIndex, only rename labels in the specified
            level.
        errors : {'ignore', 'raise'}, default 'ignore'
            If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
            or `columns` contains labels that are not present in the Index
            being transformed.
            If 'ignore', existing keys will be renamed and extra keys will be
            ignored.

        Returns
        -------
        renamed : %(klass)s (new object)

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis and
            "errors='raise'".

        See Also
        --------
        NDFrame.rename_axis

        Examples
        --------

        >>> s = pd.Series([1, 2, 3])
        >>> s
        0    1
        1    2
        2    3
        dtype: int64
        >>> s.rename("my_name")  # scalar, changes Series.name
        0    1
        1    2
        2    3
        Name: my_name, dtype: int64
        >>> s.rename(lambda x: x ** 2)  # function, changes labels
        0    1
        1    2
        4    3
        dtype: int64
        >>> s.rename({1: 3, 2: 5})  # mapping, changes labels
        0    1
        3    2
        5    3
        dtype: int64

        Since ``DataFrame`` doesn't have a ``.name`` attribute,
        only mapping-type arguments are allowed.

        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        >>> df.rename(2)
        Traceback (most recent call last):
        ...
        TypeError: 'int' object is not callable

        ``DataFrame.rename`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        We *highly* recommend using keyword arguments to clarify your
        intent.

        >>> df.rename(index=str, columns={"A": "a", "B": "c"})
           a  c
        0  1  4
        1  2  5
        2  3  6

        >>> df.rename(index=str, columns={"A": "a", "C": "c"})
           a  B
        0  1  4
        1  2  5
        2  3  6

        Using axis-style parameters

        >>> df.rename(str.lower, axis='columns')
           a  b
        0  1  4
        1  2  5
        2  3  6

        >>> df.rename({1: 2, 2: 4}, axis='index')
           A  B
        0  1  4
        2  2  5
        4  3  6

        See the :ref:`user guide <basics.rename>` for more.
        """
        if mapper is None and index is None and columns is None:
            raise TypeError("must pass an index to rename")

        if index is not None or columns is not None:
            if axis is not None:
                raise TypeError(
                    "Cannot specify both 'axis' and any of 'index' or 'columns'"
                )
            elif mapper is not None:
                raise TypeError(
                    "Cannot specify both 'mapper' and any of 'index' or 'columns'"
                )
        else:
            # use the mapper argument
            if axis and self._get_axis_number(axis) == 1:
                columns = mapper
            else:
                index = mapper

        result = self if inplace else self.copy(deep=copy)

        for axis_no, replacements in enumerate((index, columns)):
            if replacements is None:
                continue

            ax = self._get_axis(axis_no)
            baxis = self._get_block_manager_axis(axis_no)
            f = com.get_rename_function(replacements)

            if level is not None:
                level = ax._get_level_number(level)

            # GH 13473
            if not callable(replacements):
                indexer = ax.get_indexer_for(replacements)
                if errors == "raise" and len(indexer[indexer == -1]):
                    missing_labels = [
                        label
                        for index, label in enumerate(replacements)
                        if indexer[index] == -1
                    ]
                    raise KeyError(f"{missing_labels} not found in axis")

            result._data = result._data.rename_axis(
                f, axis=baxis, copy=copy, level=level
            )
            result._clear_item_cache()

        if inplace:
            self._update_inplace(result._data)
            return None
        else:
            return result.__finalize__(self)

    @rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)])
    def rename_axis(self, mapper=lib.no_default, **kwargs):
        """
        Set the name of the axis for the index or columns.

        Parameters
        ----------
        mapper : scalar, list-like, optional
            Value to set the axis name attribute.
        index, columns : scalar, list-like, dict-like or function, optional
            A scalar, list-like, dict-like or function transformation to
            apply to that axis' values.

            Use either ``mapper`` and ``axis`` to
            specify the axis to target with ``mapper``, or ``index``
            and/or ``columns``.

            .. versionchanged:: 0.24.0

        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to rename.
        copy : bool, default True
            Also copy underlying data.
        inplace : bool, default False
            Modifies the object directly, instead of creating a new Series
            or DataFrame.

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or None if `inplace` is True.

        See Also
        --------
        Series.rename : Alter Series index labels or name.
        DataFrame.rename : Alter DataFrame index labels or name.
        Index.rename : Set new names on index.

        Notes
        -----
        ``DataFrame.rename_axis`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        The first calling convention will only modify the names of
        the index and/or the names of the Index object that is the columns.
        In this case, the parameter ``copy`` is ignored.

        The second calling convention will modify the names of the
        corresponding index if mapper is a list or a scalar.
        However, if mapper is dict-like or a function, it will use the
        deprecated behavior of modifying the axis *labels*.

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Examples
        --------
        **Series**

        >>> s = pd.Series(["dog", "cat", "monkey"])
        >>> s
        0       dog
        1       cat
        2    monkey
        dtype: object
        >>> s.rename_axis("animal")
        animal
        0       dog
        1       cat
        2    monkey
        dtype: object

        **DataFrame**

        >>> df = pd.DataFrame({"num_legs": [4, 4, 2],
        ...                    "num_arms": [0, 0, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs  num_arms
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("animal")
        >>> df
                num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("limbs", axis="columns")
        >>> df
        limbs   num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2

        **MultiIndex**

        >>> df.index = pd.MultiIndex.from_product([['mammal'],
        ...                                        ['dog', 'cat', 'monkey']],
        ...                                       names=['type', 'name'])
        >>> df
        limbs          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2

        >>> df.rename_axis(index={'type': 'class'})
        limbs          num_legs  num_arms
        class  name
        mammal dog            4         0
               cat            4         0
               monkey         2         2

        >>> df.rename_axis(columns=str.upper)
        LIMBS          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2
        """
        axes, kwargs = self._construct_axes_from_arguments(
            (), kwargs, sentinel=lib.no_default
        )
        copy = kwargs.pop("copy", True)
        inplace = kwargs.pop("inplace", False)
        axis = kwargs.pop("axis", 0)
        if axis is not None:
            axis = self._get_axis_number(axis)

        if kwargs:
            raise TypeError(
                "rename_axis() got an unexpected keyword "
                f'argument "{list(kwargs.keys())[0]}"'
            )

        inplace = validate_bool_kwarg(inplace, "inplace")

        if mapper is not lib.no_default:
            # Use v0.23 behavior if a scalar or list
            non_mapper = is_scalar(mapper) or (
                is_list_like(mapper) and not is_dict_like(mapper)
            )
            if non_mapper:
                return self._set_axis_name(mapper, axis=axis, inplace=inplace)
            else:
                raise ValueError("Use `.rename` to alter labels with a mapper.")
        else:
            # Use new behavior. Means that index and/or columns
            # is specified
            result = self if inplace else self.copy(deep=copy)

            for axis in range(self._AXIS_LEN):
                v = axes.get(self._AXIS_NAMES[axis])
                if v is lib.no_default:
                    continue
                non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v))
                if non_mapper:
                    newnames = v
                else:
                    f = com.get_rename_function(v)
                    curnames = self._get_axis(axis).names
                    newnames = [f(name) for name in curnames]
                result._set_axis_name(newnames, axis=axis, inplace=True)
            if not inplace:
                return result

    def _set_axis_name(self, name, axis=0, inplace=False):
        """
        Set the name(s) of the axis.

        Parameters
        ----------
        name : str or list of str
            Name(s) to set.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to set the label. The value 0 or 'index' specifies index,
            and the value 1 or 'columns' specifies columns.
        inplace : bool, default False
            If `True`, do operation inplace and return None.

            .. versionadded:: 0.21.0

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or `None` if `inplace` is `True`.

        See Also
        --------
        DataFrame.rename : Alter the axis labels of :class:`DataFrame`.
        Series.rename : Alter the index labels or set the index name
            of :class:`Series`.
        Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`.

        Examples
        --------
        >>> df = pd.DataFrame({"num_legs": [4, 4, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs
        dog            4
        cat            4
        monkey         2
        >>> df._set_axis_name("animal")
                num_legs
        animal
        dog            4
        cat            4
        monkey         2
        >>> df.index = pd.MultiIndex.from_product(
        ...                [["mammal"], ['dog', 'cat', 'monkey']])
        >>> df._set_axis_name(["type", "name"])
                       num_legs
        type   name
        mammal dog            4
               cat            4
               monkey         2
        """
        axis = self._get_axis_number(axis)
        idx = self._get_axis(axis).set_names(name)

        inplace = validate_bool_kwarg(inplace, "inplace")
        renamed = self if inplace else self.copy()
        renamed.set_axis(idx, axis=axis, inplace=True)
        if not inplace:
            return renamed

    # ----------------------------------------------------------------------
    # Comparison Methods

    def _indexed_same(self, other) -> bool:
        return all(
            self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS
        )

    def equals(self, other):
        """
        Test whether two objects contain the same elements.

        This function allows two Series or DataFrames to be compared against
        each other to see if they have the same shape and elements. NaNs in
        the same location are considered equal. The column headers do not
        need to have the same type, but the elements within the columns must
        be the same dtype.

        Parameters
        ----------
        other : Series or DataFrame
            The other Series or DataFrame to be compared with the first.

        Returns
        -------
        bool
            True if all elements are the same in both objects, False
            otherwise.

        See Also
        --------
        Series.eq : Compare two Series objects of the same length
            and return a Series where each element is True if the element
            in each Series is equal, False otherwise.
        DataFrame.eq : Compare two DataFrame objects of the same shape and
            return a DataFrame where each element is True if the respective
            element in each DataFrame is equal, False otherwise.
        testing.assert_series_equal : Raises an AssertionError if left and
            right are not equal. Provides an easy interface to ignore
            inequality in dtypes, indexes and precision among others.
        testing.assert_frame_equal : Like assert_series_equal, but targets
            DataFrames.
        numpy.array_equal : Return True if two arrays have the same shape
            and elements, False otherwise.

        Notes
        -----
        This function requires that the elements have the same dtype as their
        respective elements in the other Series or DataFrame. However, the
        column labels do not need to have the same type, as long as they are
        still considered equal.

        Examples
        --------
        >>> df = pd.DataFrame({1: [10], 2: [20]})
        >>> df
            1   2
        0  10  20

        DataFrames df and exactly_equal have the same types and values for
        their elements and column labels, which will return True.

        >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
        >>> exactly_equal
            1   2
        0  10  20
        >>> df.equals(exactly_equal)
        True

        DataFrames df and different_column_type have the same element
        types and values, but have different types for the column labels,
        which will still return True.

        >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
        >>> different_column_type
           1.0  2.0
        0   10   20
        >>> df.equals(different_column_type)
        True

        DataFrames df and different_data_type have different types for the
        same values for their elements, and will return False even though
        their column labels are the same values and types.

        >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
        >>> different_data_type
              1     2
        0  10.0  20.0
        >>> df.equals(different_data_type)
        False
        """
        if not isinstance(other, self._constructor):
            return False
        return self._data.equals(other._data)

    # -------------------------------------------------------------------------
    # Unary Methods

    def __neg__(self):
        values = com.values_from_object(self)
        if is_bool_dtype(values):
            arr = operator.inv(values)
        elif (
            is_numeric_dtype(values)
            or is_timedelta64_dtype(values)
            or is_object_dtype(values)
        ):
            arr = operator.neg(values)
        else:
            raise TypeError(f"Unary negative expects numeric dtype, not {values.dtype}")
        return self.__array_wrap__(arr)

    def __pos__(self):
        values = com.values_from_object(self)
        if is_bool_dtype(values) or is_period_arraylike(values):
            arr = values
        elif (
            is_numeric_dtype(values)
            or is_timedelta64_dtype(values)
            or is_object_dtype(values)
        ):
            arr = operator.pos(values)
        else:
            raise TypeError(f"Unary plus expects numeric dtype, not {values.dtype}")
        return self.__array_wrap__(arr)

    def __invert__(self):
        if not self.size:
            # inv fails with 0 len
            return self

        new_data = self._data.apply(operator.invert)
        result = self._constructor(new_data).__finalize__(self)
        return result

    def __nonzero__(self):
        raise ValueError(
            f"The truth value of a {type(self).__name__} is ambiguous. "
            "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
        )

    __bool__ = __nonzero__

    def bool(self):
        """
        Return the bool of a single element PandasObject.

        This must be a boolean scalar value, either True or False. Raise a
        ValueError if the PandasObject does not have exactly 1 element, or that
        element is not boolean.

        Returns
        -------
        bool
            Same single boolean value converted to bool type.
        """
        v = self.squeeze()
        if isinstance(v, (bool, np.bool_)):
            return bool(v)
        elif is_scalar(v):
            raise ValueError(
                "bool cannot act on a non-boolean single element "
                f"{type(self).__name__}"
            )

        self.__nonzero__()

    def __abs__(self: FrameOrSeries) -> FrameOrSeries:
        return self.abs()

    def __round__(self: FrameOrSeries, decimals: int = 0) -> FrameOrSeries:
        return self.round(decimals)

    # -------------------------------------------------------------------------
    # Label or Level Combination Helpers
    #
    # A collection of helper methods for DataFrame/Series operations that
    # accept a combination of column/index labels and levels. All such
    # operations should utilize/extend these methods when possible so that we
    # have consistent precedence and validation logic throughout the library.

    def _is_level_reference(self, key, axis=0):
        """
        Test whether a key is a level reference for a given axis.

        To be considered a level reference, `key` must be a string that:
          - (axis=0): Matches the name of an index level and does NOT match
            a column label.
          - (axis=1): Matches the name of a column level and does NOT match
            an index label.

        Parameters
        ----------
        key : str
            Potential level name for the given axis
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        is_level : bool
        """
        axis = self._get_axis_number(axis)

        return (
            key is not None
            and is_hashable(key)
            and key in self.axes[axis].names
            and not self._is_label_reference(key, axis=axis)
        )

    def _is_label_reference(self, key, axis=0) -> bool_t:
        """
        Test whether a key is a label reference for a given axis.

        To be considered a label reference, `key` must be a string that:
          - (axis=0): Matches a column label
          - (axis=1): Matches an index label

        Parameters
        ----------
        key : str
            Potential label name
        axis : int, default 0
            Axis perpendicular to the axis that labels are associated with
            (0 means search for column labels, 1 means search for index labels)

        Returns
        -------
        is_label : bool
        """
        axis = self._get_axis_number(axis)
        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis)

        return (
            key is not None
            and is_hashable(key)
            and any(key in self.axes[ax] for ax in other_axes)
        )
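
    # Illustrative note (not in the original source): given
    # df = pd.DataFrame({"x": [1]}, index=pd.Index([0], name="i")),
    # df._is_label_reference("x") is True (column label) while
    # df._is_level_reference("i") is True (index level name). The two checks
    # are deliberately mutually exclusive, so a name never counts as both.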

    def _is_label_or_level_reference(self, key: str, axis: int = 0) -> bool_t:
        """
        Test whether a key is a label or level reference for a given axis.

        To be considered either a label or a level reference, `key` must be a
        string that:
          - (axis=0): Matches a column label or an index level
          - (axis=1): Matches an index label or a column level

        Parameters
        ----------
        key : str
            Potential label or level name
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        is_label_or_level : bool
        """
        return self._is_level_reference(key, axis=axis) or self._is_label_reference(
            key, axis=axis
        )

    def _check_label_or_level_ambiguity(self, key, axis: int = 0) -> None:
        """
        Check whether `key` is ambiguous.

        By ambiguous, we mean that it matches both a level of the input
        `axis` and a label of the other axis.

        Parameters
        ----------
        key : str or object
            Label or level name.
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns).

        Raises
        ------
        ValueError: `key` is ambiguous
        """
        axis = self._get_axis_number(axis)
        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis)

        if (
            key is not None
            and is_hashable(key)
            and key in self.axes[axis].names
            and any(key in self.axes[ax] for ax in other_axes)
        ):

            # Build an informative and grammatical warning
            level_article, level_type = (
                ("an", "index") if axis == 0 else ("a", "column")
            )

            label_article, label_type = (
                ("a", "column") if axis == 0 else ("an", "index")
            )

            msg = (
                f"'{key}' is both {level_article} {level_type} level and "
                f"{label_article} {label_type} label, which is ambiguous."
            )
            raise ValueError(msg)

    def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray:
        """
        Return a 1-D array of values associated with `key`, a label or level
        from the given `axis`.

        Retrieval logic:
          - (axis=0): Return column values if `key` matches a column label.
            Otherwise return index level values if `key` matches an index
            level.
          - (axis=1): Return row values if `key` matches an index label.
            Otherwise return column level values if `key` matches a column
            level.

        Parameters
        ----------
        key : str
            Label or level name.
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        values : np.ndarray

        Raises
        ------
        KeyError
            if `key` matches neither a label nor a level
        ValueError
            if `key` matches multiple labels
        FutureWarning
            if `key` is ambiguous. This will become an ambiguity error in a
            future version
        """
        axis = self._get_axis_number(axis)
        other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]

        if self._is_label_reference(key, axis=axis):
            self._check_label_or_level_ambiguity(key, axis=axis)
            values = self.xs(key, axis=other_axes[0])._values
        elif self._is_level_reference(key, axis=axis):
            values = self.axes[axis].get_level_values(key)._values
        else:
            raise KeyError(key)

        # Check for duplicates
        if values.ndim > 1:

            if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
                multi_message = (
                    "\n"
                    "For a multi-index, the label must be a "
                    "tuple with elements corresponding to "
                    "each level."
                )
            else:
                multi_message = ""

            label_axis_name = "column" if axis == 0 else "index"
            raise ValueError(
                (
                    f"The {label_axis_name} label '{key}' "
                    f"is not unique.{multi_message}"
                )
            )

        return values

    def _drop_labels_or_levels(self, keys, axis: int = 0):
        """
        Drop labels and/or levels for the given `axis`.

        For each key in `keys`:
          - (axis=0): If key matches a column label then drop the column.
            Otherwise if key matches an index level then drop the level.
          - (axis=1): If key matches an index label then drop the row.
            Otherwise if key matches a column level then drop the level.

        Parameters
        ----------
        keys : str or list of str
            labels or levels to drop
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        dropped : DataFrame

        Raises
        ------
        ValueError
            if any `keys` match neither a label nor a level
        """
        axis = self._get_axis_number(axis)

        # Validate keys
        keys = com.maybe_make_list(keys)
        invalid_keys = [
            k for k in keys if not self._is_label_or_level_reference(k, axis=axis)
        ]

        if invalid_keys:
            raise ValueError(
                (
                    "The following keys are not valid labels or "
                    f"levels for axis {axis}: {invalid_keys}"
                )
            )

        # Compute levels and labels to drop
        levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)]

        labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)]

        # Perform copy upfront and then use inplace operations below.
        # This ensures that we always perform exactly one copy.
        # ``copy`` and/or ``inplace`` options could be added in the future.
        dropped = self.copy()

        if axis == 0:
            # Handle dropping index levels
            if levels_to_drop:
                dropped.reset_index(levels_to_drop, drop=True, inplace=True)

            # Handle dropping column labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=1, inplace=True)
        else:
            # Handle dropping column levels
            if levels_to_drop:
                if isinstance(dropped.columns, MultiIndex):
                    # Drop the specified levels from the MultiIndex
                    dropped.columns = dropped.columns.droplevel(levels_to_drop)
                else:
                    # Drop the last level of Index by replacing with
                    # a RangeIndex
                    dropped.columns = RangeIndex(dropped.columns.size)

            # Handle dropping index labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=0, inplace=True)

        return dropped

    # ----------------------------------------------------------------------
    # Iteration

    def __hash__(self):
        raise TypeError(
            f"{repr(type(self).__name__)} objects are mutable, "
            f"thus they cannot be hashed"
        )

    def __iter__(self):
        """
        Iterate over info axis.

        Returns
        -------
        iterator
            Info axis as iterator.
        """
        return iter(self._info_axis)

    # can we get a better explanation of this?
    def keys(self):
        """
        Get the 'info axis' (see Indexing for more).

        This is index for Series, columns for DataFrame.

        Returns
        -------
        Index
            Info axis.
        """
        return self._info_axis

    def items(self):
        """
        Iterate over (label, values) on info axis.

        This is index for Series and columns for DataFrame.

        Returns
        -------
        Generator
        """
        for h in self._info_axis:
            yield h, self[h]

    @Appender(items.__doc__)
    def iteritems(self):
        return self.items()

    def __len__(self) -> int:
        """Returns length of info axis"""
        return len(self._info_axis)

    def __contains__(self, key) -> bool_t:
        """True if the key is in the info axis"""
        return key in self._info_axis

    @property
    def empty(self) -> bool_t:
        """
        Indicator whether DataFrame is empty.

        True if DataFrame is entirely empty (no items), meaning any of the
        axes are of length 0.

        Returns
        -------
        bool
            If DataFrame is empty, return True, if not return False.

        See Also
        --------
        Series.dropna
        DataFrame.dropna

        Notes
        -----
        If DataFrame contains only NaNs, it is still not considered empty. See
        the example below.

        Examples
        --------
        An example of an actual empty DataFrame. Notice the index is empty:

        >>> df_empty = pd.DataFrame({'A' : []})
        >>> df_empty
        Empty DataFrame
        Columns: [A]
        Index: []
        >>> df_empty.empty
        True

        If we only have NaNs in our DataFrame, it is not considered empty! We
        will need to drop the NaNs to make the DataFrame empty:

        >>> df = pd.DataFrame({'A' : [np.nan]})
        >>> df
            A
        0 NaN
        >>> df.empty
        False
        >>> df.dropna().empty
        True
        """
        return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS)

    # ----------------------------------------------------------------------
    # Array Interface

    # This is also set in IndexOpsMixin
    # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
    __array_priority__ = 1000

    def __array__(self, dtype=None) -> np.ndarray:
        return com.values_from_object(self)

    def __array_wrap__(self, result, context=None):
        result = lib.item_from_zerodim(result)
        if is_scalar(result):
            # e.g. we get here with np.ptp(series)
            # ptp also requires the item_from_zerodim
            return result
        d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
        return self._constructor(result, **d).__finalize__(self)

    # ideally we would define this to avoid the getattr checks, but
    # it is slower
    # @property
    # def __array_interface__(self):
    #     """ provide numpy array interface method """
    #     values = self.values
    #     return dict(typestr=values.dtype.str, shape=values.shape, data=values)
1928 # ----------------------------------------------------------------------
1929 # Picklability
1931 def __getstate__(self) -> Dict[str, Any]:
1932 meta = {k: getattr(self, k, None) for k in self._metadata}
1933 return dict(
1934 _data=self._data,
1935 _typ=self._typ,
1936 _metadata=self._metadata,
1937 attrs=self.attrs,
1938 **meta,
1939 )
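
    # Illustrative note (not in the original source): pickling round-trips
    # through this dict form, e.g. pickle.loads(pickle.dumps(df)) rebuilds
    # the object by handing the "_typ"-keyed dict back to __setstate__ below.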

    def __setstate__(self, state):

        if isinstance(state, BlockManager):
            self._data = state
        elif isinstance(state, dict):
            typ = state.get("_typ")
            if typ is not None:
                attrs = state.get("_attrs", {})
                object.__setattr__(self, "_attrs", attrs)

                # set in the order of internal names
                # to avoid definitional recursion
                # e.g. say fill_value needing _data to be
                # defined
                meta = set(self._internal_names + self._metadata)
                for k in list(meta):
                    if k in state:
                        v = state[k]
                        object.__setattr__(self, k, v)

                for k, v in state.items():
                    if k not in meta:
                        object.__setattr__(self, k, v)

            else:
                self._unpickle_series_compat(state)
        elif len(state) == 2:
            self._unpickle_series_compat(state)

        self._item_cache = {}

    # ----------------------------------------------------------------------
    # Rendering Methods

    def __repr__(self) -> str:
        # string representation based upon iterating over self
        # (since, by definition, `PandasContainers` are iterable)
        prepr = f"[{','.join(map(pprint_thing, self))}]"
        return f"{type(self).__name__}({prepr})"

    def _repr_latex_(self):
        """
        Returns a LaTeX representation for a particular object.
        Mainly for use with nbconvert (jupyter notebook conversion to pdf).
        """
        if config.get_option("display.latex.repr"):
            return self.to_latex()
        else:
            return None

    def _repr_data_resource_(self):
        """
        Not a real Jupyter special repr method, but we use the same
        naming convention.
        """
        if config.get_option("display.html.table_schema"):
            data = self.head(config.get_option("display.max_rows"))
            payload = json.loads(
                data.to_json(orient="table"), object_pairs_hook=collections.OrderedDict
            )
            return payload

    # ----------------------------------------------------------------------
    # I/O Methods

    _shared_docs[
        "to_markdown"
    ] = """
    Print %(klass)s in Markdown-friendly format.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    buf : writable buffer, defaults to sys.stdout
        Where to send the output. By default, the output is printed to
        sys.stdout. Pass a writable buffer if you need to further process
        the output.
    mode : str, optional
        Mode in which file is opened.
    **kwargs
        These parameters will be passed to `tabulate`.

    Returns
    -------
    str
        %(klass)s in Markdown-friendly format.
    """
2030 _shared_docs[
2031 "to_excel"
2032 ] = """
2033 Write %(klass)s to an Excel sheet.
2035 To write a single %(klass)s to an Excel .xlsx file it is only necessary to
2036 specify a target file name. To write to multiple sheets it is necessary to
2037 create an `ExcelWriter` object with a target file name, and specify a sheet
2038 in the file to write to.
2040 Multiple sheets may be written to by specifying unique `sheet_name`.
2041 With all data written to the file it is necessary to save the changes.
2042 Note that creating an `ExcelWriter` object with a file name that already
2043 exists will result in the contents of the existing file being erased.
2045 Parameters
2046 ----------
2047 excel_writer : str or ExcelWriter object
2048 File path or existing ExcelWriter.
2049 sheet_name : str, default 'Sheet1'
2050 Name of sheet which will contain DataFrame.
2051 na_rep : str, default ''
2052 Missing data representation.
2053 float_format : str, optional
2054 Format string for floating point numbers. For example
2055 ``float_format="%%.2f"`` will format 0.1234 to 0.12.
2056 columns : sequence or list of str, optional
2057 Columns to write.
2058 header : bool or list of str, default True
2059 Write out the column names. If a list of string is given it is
2060 assumed to be aliases for the column names.
2061 index : bool, default True
2062 Write row names (index).
2063 index_label : str or sequence, optional
2064 Column label for index column(s) if desired. If not specified, and
2065 `header` and `index` are True, then the index names are used. A
2066 sequence should be given if the DataFrame uses MultiIndex.
2067 startrow : int, default 0
2068 Upper left cell row to dump data frame.
2069 startcol : int, default 0
2070 Upper left cell column to dump data frame.
2071 engine : str, optional
2072 Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
2073 via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
2074 ``io.excel.xlsm.writer``.
2075 merge_cells : bool, default True
2076 Write MultiIndex and Hierarchical Rows as merged cells.
2077 encoding : str, optional
2078 Encoding of the resulting excel file. Only necessary for xlwt,
2079 other writers support unicode natively.
2080 inf_rep : str, default 'inf'
2081 Representation for infinity (there is no native representation for
2082 infinity in Excel).
2083 verbose : bool, default True
2084 Display more information in the error logs.
2085 freeze_panes : tuple of int (length 2), optional
2086 Specifies the one-based bottommost row and rightmost column that
2087 is to be frozen.
2089 See Also
2090 --------
2091 to_csv : Write DataFrame to a comma-separated values (csv) file.
2092 ExcelWriter : Class for writing DataFrame objects into excel sheets.
2093 read_excel : Read an Excel file into a pandas DataFrame.
2094 read_csv : Read a comma-separated values (csv) file into DataFrame.
2096 Notes
2097 -----
2098 For compatibility with :meth:`~DataFrame.to_csv`,
2099 to_excel serializes lists and dicts to strings before writing.
2101 Once a workbook has been saved it is not possible to write further data
2102 without rewriting the whole workbook.
2104 Examples
2105 --------
2107 Create, write to and save a workbook:
2109 >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']],
2110 ... index=['row 1', 'row 2'],
2111 ... columns=['col 1', 'col 2'])
2112 >>> df1.to_excel("output.xlsx") # doctest: +SKIP
2114 To specify the sheet name:
2116 >>> df1.to_excel("output.xlsx",
2117 ... sheet_name='Sheet_name_1') # doctest: +SKIP
2119 If you wish to write to more than one sheet in the workbook, it is
2120 necessary to specify an ExcelWriter object:
2122 >>> df2 = df1.copy()
2123 >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP
2124 ... df1.to_excel(writer, sheet_name='Sheet_name_1')
2125 ... df2.to_excel(writer, sheet_name='Sheet_name_2')
2127 ExcelWriter can also be used to append to an existing Excel file:
2129 >>> with pd.ExcelWriter('output.xlsx',
2130 ... mode='a') as writer: # doctest: +SKIP
2131 ... df.to_excel(writer, sheet_name='Sheet_name_3')
2133 To set the library that is used to write the Excel file,
2134 you can pass the `engine` keyword (the default engine is
2135 automatically chosen depending on the file extension):
2137 >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP
2138 """
2140 @Appender(_shared_docs["to_excel"] % dict(klass="object"))
2141 def to_excel(
2142 self,
2143 excel_writer,
2144 sheet_name="Sheet1",
2145 na_rep="",
2146 float_format=None,
2147 columns=None,
2148 header=True,
2149 index=True,
2150 index_label=None,
2151 startrow=0,
2152 startcol=0,
2153 engine=None,
2154 merge_cells=True,
2155 encoding=None,
2156 inf_rep="inf",
2157 verbose=True,
2158 freeze_panes=None,
2159 ) -> None:
2160 df = self if isinstance(self, ABCDataFrame) else self.to_frame()
2162 from pandas.io.formats.excel import ExcelFormatter
2164 formatter = ExcelFormatter(
2165 df,
2166 na_rep=na_rep,
2167 cols=columns,
2168 header=header,
2169 float_format=float_format,
2170 index=index,
2171 index_label=index_label,
2172 merge_cells=merge_cells,
2173 inf_rep=inf_rep,
2174 )
2175 formatter.write(
2176 excel_writer,
2177 sheet_name=sheet_name,
2178 startrow=startrow,
2179 startcol=startcol,
2180 freeze_panes=freeze_panes,
2181 engine=engine,
2182 )
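# to_excel is a thin wrapper: a Series is first promoted to a one-column
# frame, then all cell formatting and writing is delegated to
# ExcelFormatter. A hedged sketch of the equivalent direct call, using only
# the arguments visible in the method body above:
#
#   >>> from pandas.io.formats.excel import ExcelFormatter
#   >>> fmt = ExcelFormatter(df, na_rep="", inf_rep="inf")  # doctest: +SKIP
#   >>> fmt.write("output.xlsx", sheet_name="Sheet1")  # doctest: +SKIP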
2184 def to_json(
2185 self,
2186 path_or_buf: Optional[FilePathOrBuffer] = None,
2187 orient: Optional[str] = None,
2188 date_format: Optional[str] = None,
2189 double_precision: int = 10,
2190 force_ascii: bool_t = True,
2191 date_unit: str = "ms",
2192 default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
2193 lines: bool_t = False,
2194 compression: Optional[str] = "infer",
2195 index: bool_t = True,
2196 indent: Optional[int] = None,
2197 ) -> Optional[str]:
2198 """
2199 Convert the object to a JSON string.
2201 Note that NaN's and None will be converted to null, and datetime objects
2202 will be converted to UNIX timestamps.
2204 Parameters
2205 ----------
2206 path_or_buf : str or file handle, optional
2207 File path or object. If not specified, the result is returned as
2208 a string.
2209 orient : str
2210 Indication of expected JSON string format.
2212 * Series:
2214 - default is 'index'
2215 - allowed values are: {'split','records','index','table'}.
2217 * DataFrame:
2219 - default is 'columns'
2220 - allowed values are: {'split', 'records', 'index', 'columns',
2221 'values', 'table'}.
2223 * The format of the JSON string:
2225 - 'split' : dict like {'index' -> [index], 'columns' -> [columns],
2226 'data' -> [values]}
2227 - 'records' : list like [{column -> value}, ... , {column -> value}]
2228 - 'index' : dict like {index -> {column -> value}}
2229 - 'columns' : dict like {column -> {index -> value}}
2230 - 'values' : just the values array
2231 - 'table' : dict like {'schema': {schema}, 'data': {data}}
2233 Describing the data, where the data component is like ``orient='records'``.
2235 .. versionchanged:: 0.20.0
2237 date_format : {None, 'epoch', 'iso'}
2238 Type of date conversion. 'epoch' = epoch milliseconds,
2239 'iso' = ISO8601. The default depends on the `orient`. For
2240 ``orient='table'``, the default is 'iso'. For all other orients,
2241 the default is 'epoch'.
2242 double_precision : int, default 10
2243 The number of decimal places to use when encoding
2244 floating point values.
2245 force_ascii : bool, default True
2246 Force encoded string to be ASCII.
2247 date_unit : str, default 'ms' (milliseconds)
2248 The time unit to encode to, governs timestamp and ISO8601
2249 precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
2250 microsecond, and nanosecond respectively.
2251 default_handler : callable, default None
2252 Handler to call if object cannot otherwise be converted to a
2253 suitable format for JSON. Should receive a single argument which is
2254 the object to convert and return a serialisable object.
2255 lines : bool, default False
2256 If 'orient' is 'records', write out line-delimited json format. Will
2257 throw ValueError if 'orient' is incorrect, since the other orients are
2258 not list-like.
2260 compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
2262 A string representing the compression to use in the output file,
2263 only used when the first argument is a filename. By default, the
2264 compression is inferred from the filename.
2266 .. versionadded:: 0.21.0
2267 .. versionchanged:: 0.24.0
2268 'infer' option added and set to default
2269 index : bool, default True
2270 Whether to include the index values in the JSON string. Not
2271 including the index (``index=False``) is only supported when
2272 orient is 'split' or 'table'.
2274 .. versionadded:: 0.23.0
2276 indent : int, optional
2277 Length of whitespace used to indent each record.
2279 .. versionadded:: 1.0.0
2281 Returns
2282 -------
2283 None or str
2284 If path_or_buf is None, returns the resulting json format as a
2285 string. Otherwise returns None.
2287 See Also
2288 --------
2289 read_json
2291 Notes
2292 -----
2293 The behavior of ``indent=0`` varies from the stdlib, which does not
2294 indent the output but does insert newlines. Currently, ``indent=0``
2295 and the default ``indent=None`` are equivalent in pandas, though this
2296 may change in a future release.
2298 Examples
2299 --------
2301 >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
2302 ... index=['row 1', 'row 2'],
2303 ... columns=['col 1', 'col 2'])
2304 >>> df.to_json(orient='split')
2305 '{"columns":["col 1","col 2"],
2306 "index":["row 1","row 2"],
2307 "data":[["a","b"],["c","d"]]}'
2309 Encoding/decoding a DataFrame using ``'records'`` formatted JSON.
2310 Note that index labels are not preserved with this encoding.
2312 >>> df.to_json(orient='records')
2313 '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
2315 Encoding/decoding a DataFrame using ``'index'`` formatted JSON:
2317 >>> df.to_json(orient='index')
2318 '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
2320 Encoding/decoding a DataFrame using ``'columns'`` formatted JSON:
2322 >>> df.to_json(orient='columns')
2323 '{"col 1":{"row 1":"a","row 2":"c"},"col 2":{"row 1":"b","row 2":"d"}}'
2325 Encoding/decoding a DataFrame using ``'values'`` formatted JSON:
2327 >>> df.to_json(orient='values')
2328 '[["a","b"],["c","d"]]'
2330 Encoding with Table Schema:
2332 >>> df.to_json(orient='table')
2333 '{"schema": {"fields": [{"name": "index", "type": "string"},
2334 {"name": "col 1", "type": "string"},
2335 {"name": "col 2", "type": "string"}],
2336 "primaryKey": "index",
2337 "pandas_version": "0.20.0"},
2338 "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
2339 {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
2340 """
2342 from pandas.io import json
2344 if date_format is None and orient == "table":
2345 date_format = "iso"
2346 elif date_format is None:
2347 date_format = "epoch"
2349 config.is_nonnegative_int(indent)
2350 indent = indent or 0
2352 return json.to_json(
2353 path_or_buf=path_or_buf,
2354 obj=self,
2355 orient=orient,
2356 date_format=date_format,
2357 double_precision=double_precision,
2358 force_ascii=force_ascii,
2359 date_unit=date_unit,
2360 default_handler=default_handler,
2361 lines=lines,
2362 compression=compression,
2363 index=index,
2364 indent=indent,
2365 )
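# A hedged sketch of the `lines` mode documented above, which is only valid
# with ``orient='records'`` and emits one JSON object per line:
#
#   >>> df = pd.DataFrame({"a": [1, 2]})
#   >>> print(df.to_json(orient="records", lines=True))  # doctest: +SKIP
#   {"a":1}
#   {"a":2}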
2367 def to_hdf(
2368 self,
2369 path_or_buf,
2370 key: str,
2371 mode: str = "a",
2372 complevel: Optional[int] = None,
2373 complib: Optional[str] = None,
2374 append: bool_t = False,
2375 format: Optional[str] = None,
2376 index: bool_t = True,
2377 min_itemsize: Optional[Union[int, Dict[str, int]]] = None,
2378 nan_rep=None,
2379 dropna: Optional[bool_t] = None,
2380 data_columns: Optional[List[str]] = None,
2381 errors: str = "strict",
2382 encoding: str = "UTF-8",
2383 ) -> None:
2384 """
2385 Write the contained data to an HDF5 file using HDFStore.
2387 Hierarchical Data Format (HDF) is self-describing, allowing an
2388 application to interpret the structure and contents of a file with
2389 no outside information. One HDF file can hold a mix of related objects
2390 which can be accessed as a group or as individual objects.
2392 In order to add another DataFrame or Series to an existing HDF file
2393 please use append mode and a different key.
2395 For more information see the :ref:`user guide <io.hdf5>`.
2397 Parameters
2398 ----------
2399 path_or_buf : str or pandas.HDFStore
2400 File path or HDFStore object.
2401 key : str
2402 Identifier for the group in the store.
2403 mode : {'a', 'w', 'r+'}, default 'a'
2404 Mode to open file:
2406 - 'w': write, a new file is created (an existing file with
2407 the same name would be deleted).
2408 - 'a': append, an existing file is opened for reading and
2409 writing, and if the file does not exist it is created.
2410 - 'r+': similar to 'a', but the file must already exist.
2411 complevel : {0-9}, optional
2412 Specifies a compression level for data.
2413 A value of 0 disables compression.
2414 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
2415 Specifies the compression library to be used.
2416 As of v0.20.2 these additional compressors for Blosc are supported
2417 (default if no compressor specified: 'blosc:blosclz'):
2418 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
2419 'blosc:zlib', 'blosc:zstd'}.
2420 Specifying a compression library which is not available raises
2421 a ValueError.
2422 append : bool, default False
2423 For Table formats, append the input data to the existing data.
2424 format : {'fixed', 'table', None}, default 'fixed'
2425 Possible values:
2427 - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
2428 nor searchable.
2429 - 'table': Table format. Write as a PyTables Table structure
2430 which may perform worse but allow more flexible operations
2431 like searching / selecting subsets of the data.
2432 - If None, pd.get_option('io.hdf.default_format') is checked,
2433 followed by fallback to "fixed".
2434 errors : str, default 'strict'
2435 Specifies how encoding and decoding errors are to be handled.
2436 See the errors argument for :func:`open` for a full list
2437 of options.
2438 encoding : str, default "UTF-8"
2439 min_itemsize : dict or int, optional
2440 Map column names to minimum string sizes for columns.
2441 nan_rep : Any, optional
2442 How to represent null values as str.
2443 Not allowed with append=True.
2444 data_columns : list of columns or True, optional
2445 List of columns to create as indexed data columns for on-disk
2446 queries, or True to use all columns. By default only the axes
2447 of the object are indexed. See :ref:`io.hdf5-query-data-columns`.
2448 Applicable only to format='table'.
2450 See Also
2451 --------
2452 read_hdf : Read from HDF file.
2453 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
2454 DataFrame.to_sql : Write to a sql table.
2455 DataFrame.to_feather : Write out feather-format for DataFrames.
2456 DataFrame.to_csv : Write out to a csv file.
2458 Examples
2459 --------
2460 >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
2461 ... index=['a', 'b', 'c'])
2462 >>> df.to_hdf('data.h5', key='df', mode='w')
2464 We can add another object to the same file:
2466 >>> s = pd.Series([1, 2, 3, 4])
2467 >>> s.to_hdf('data.h5', key='s')
2469 Reading from HDF file:
2471 >>> pd.read_hdf('data.h5', 'df')
2472 A B
2473 a 1 4
2474 b 2 5
2475 c 3 6
2476 >>> pd.read_hdf('data.h5', 's')
2477 0 1
2478 1 2
2479 2 3
2480 3 4
2481 dtype: int64
2483 Deleting file with data:
2485 >>> import os
2486 >>> os.remove('data.h5')
2487 """
2488 from pandas.io import pytables
2490 pytables.to_hdf(
2491 path_or_buf,
2492 key,
2493 self,
2494 mode=mode,
2495 complevel=complevel,
2496 complib=complib,
2497 append=append,
2498 format=format,
2499 index=index,
2500 min_itemsize=min_itemsize,
2501 nan_rep=nan_rep,
2502 dropna=dropna,
2503 data_columns=data_columns,
2504 errors=errors,
2505 encoding=encoding,
2506 )
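# A hedged sketch of the append workflow described in the docstring: only
# format='table' supports appending, and PyTables must be installed
# ('data.h5' is a placeholder path):
#
#   >>> df.to_hdf("data.h5", key="df", format="table", mode="w")  # doctest: +SKIP
#   >>> df.to_hdf("data.h5", key="df", format="table", append=True)  # doctest: +SKIP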
2508 def to_sql(
2509 self,
2510 name: str,
2511 con,
2512 schema=None,
2513 if_exists: str = "fail",
2514 index: bool_t = True,
2515 index_label=None,
2516 chunksize=None,
2517 dtype=None,
2518 method=None,
2519 ) -> None:
2520 """
2521 Write records stored in a DataFrame to a SQL database.
2523 Databases supported by SQLAlchemy [1]_ are supported. Tables can be
2524 newly created, appended to, or overwritten.
2526 Parameters
2527 ----------
2528 name : str
2529 Name of SQL table.
2530 con : sqlalchemy.engine.Engine or sqlite3.Connection
2531 Using SQLAlchemy makes it possible to use any DB supported by that
2532 library. Legacy support is provided for sqlite3.Connection objects. The user
2533 is responsible for engine disposal and connection closure for the SQLAlchemy
2534 connectable. See `here \
2535 <https://docs.sqlalchemy.org/en/13/core/connections.html>`_.
2537 schema : str, optional
2538 Specify the schema (if database flavor supports this). If None, use
2539 default schema.
2540 if_exists : {'fail', 'replace', 'append'}, default 'fail'
2541 How to behave if the table already exists.
2543 * fail: Raise a ValueError.
2544 * replace: Drop the table before inserting new values.
2545 * append: Insert new values to the existing table.
2547 index : bool, default True
2548 Write DataFrame index as a column. Uses `index_label` as the column
2549 name in the table.
2550 index_label : str or sequence, default None
2551 Column label for index column(s). If None is given (default) and
2552 `index` is True, then the index names are used.
2553 A sequence should be given if the DataFrame uses MultiIndex.
2554 chunksize : int, optional
2555 Specify the number of rows in each batch to be written at a time.
2556 By default, all rows will be written at once.
2557 dtype : dict or scalar, optional
2558 Specifying the datatype for columns. If a dictionary is used, the
2559 keys should be the column names and the values should be the
2560 SQLAlchemy types or strings for the sqlite3 legacy mode. If a
2561 scalar is provided, it will be applied to all columns.
2562 method : {None, 'multi', callable}, optional
2563 Controls the SQL insertion clause used:
2565 * None : Uses standard SQL ``INSERT`` clause (one per row).
2566 * 'multi': Pass multiple values in a single ``INSERT`` clause.
2567 * callable with signature ``(pd_table, conn, keys, data_iter)``.
2569 Details and a sample callable implementation can be found in the
2570 section :ref:`insert method <io.sql.method>`.
2572 .. versionadded:: 0.24.0
2574 Raises
2575 ------
2576 ValueError
2577 When the table already exists and `if_exists` is 'fail' (the
2578 default).
2580 See Also
2581 --------
2582 read_sql : Read a DataFrame from a table.
2584 Notes
2585 -----
2586 Timezone aware datetime columns will be written as
2587 ``Timestamp with timezone`` type with SQLAlchemy if supported by the
2588 database. Otherwise, the datetimes will be stored as timezone unaware
2589 timestamps local to the original timezone.
2591 .. versionadded:: 0.24.0
2593 References
2594 ----------
2595 .. [1] http://docs.sqlalchemy.org
2596 .. [2] https://www.python.org/dev/peps/pep-0249/
2598 Examples
2599 --------
2601 Create an in-memory SQLite database.
2603 >>> from sqlalchemy import create_engine
2604 >>> engine = create_engine('sqlite://', echo=False)
2606 Create a table from scratch with 3 rows.
2608 >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})
2609 >>> df
2610 name
2611 0 User 1
2612 1 User 2
2613 2 User 3
2615 >>> df.to_sql('users', con=engine)
2616 >>> engine.execute("SELECT * FROM users").fetchall()
2617 [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]
2619 >>> df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})
2620 >>> df1.to_sql('users', con=engine, if_exists='append')
2621 >>> engine.execute("SELECT * FROM users").fetchall()
2622 [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
2623 (0, 'User 4'), (1, 'User 5')]
2625 Overwrite the table with just ``df1``.
2627 >>> df1.to_sql('users', con=engine, if_exists='replace',
2628 ... index_label='id')
2629 >>> engine.execute("SELECT * FROM users").fetchall()
2630 [(0, 'User 4'), (1, 'User 5')]
2632 Specify the dtype (especially useful for integers with missing values).
2633 Notice that while pandas is forced to store the data as floating point,
2634 the database supports nullable integers. When fetching the data with
2635 Python, we get back integer scalars.
2637 >>> df = pd.DataFrame({"A": [1, None, 2]})
2638 >>> df
2639 A
2640 0 1.0
2641 1 NaN
2642 2 2.0
2644 >>> from sqlalchemy.types import Integer
2645 >>> df.to_sql('integers', con=engine, index=False,
2646 ... dtype={"A": Integer()})
2648 >>> engine.execute("SELECT * FROM integers").fetchall()
2649 [(1,), (None,), (2,)]
2650 """
2651 from pandas.io import sql
2653 sql.to_sql(
2654 self,
2655 name,
2656 con,
2657 schema=schema,
2658 if_exists=if_exists,
2659 index=index,
2660 index_label=index_label,
2661 chunksize=chunksize,
2662 dtype=dtype,
2663 method=method,
2664 )
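# A hedged sketch of a callable `method`, matching the documented
# ``(pd_table, conn, keys, data_iter)`` signature. The helper name and the
# SQLite-specific ``OR IGNORE`` prefix are illustrative assumptions, not a
# pandas-provided recipe:
#
#   def insert_or_ignore(pd_table, conn, keys, data_iter):
#       # keys are the column names; data_iter yields row tuples
#       rows = [dict(zip(keys, row)) for row in data_iter]
#       stmt = pd_table.table.insert().prefix_with("OR IGNORE")
#       conn.execute(stmt, rows)
#
#   >>> df.to_sql("users", con=engine, method=insert_or_ignore)  # doctest: +SKIP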
2666 def to_pickle(
2667 self,
2668 path,
2669 compression: Optional[str] = "infer",
2670 protocol: int = pickle.HIGHEST_PROTOCOL,
2671 ) -> None:
2672 """
2673 Pickle (serialize) object to file.
2675 Parameters
2676 ----------
2677 path : str
2678 File path where the pickled object will be stored.
2679 compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \
2680 default 'infer'
2681 A string representing the compression to use in the output file. By
2682 default, infers from the file extension in specified path.
2683 protocol : int
2684 Int which indicates which protocol should be used by the pickler,
2685 default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
2686 values are 0, 1, 2, 3, 4. A negative value for the protocol
2687 parameter is equivalent to setting its value to HIGHEST_PROTOCOL.
2689 .. [1] https://docs.python.org/3/library/pickle.html.
2690 .. versionadded:: 0.21.0
2692 See Also
2693 --------
2694 read_pickle : Load pickled pandas object (or any object) from file.
2695 DataFrame.to_hdf : Write DataFrame to an HDF5 file.
2696 DataFrame.to_sql : Write DataFrame to a SQL database.
2697 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
2699 Examples
2700 --------
2701 >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
2702 >>> original_df
2703 foo bar
2704 0 0 5
2705 1 1 6
2706 2 2 7
2707 3 3 8
2708 4 4 9
2709 >>> original_df.to_pickle("./dummy.pkl")
2711 >>> unpickled_df = pd.read_pickle("./dummy.pkl")
2712 >>> unpickled_df
2713 foo bar
2714 0 0 5
2715 1 1 6
2716 2 2 7
2717 3 3 8
2718 4 4 9
2720 >>> import os
2721 >>> os.remove("./dummy.pkl")
2722 """
2723 from pandas.io.pickle import to_pickle
2725 to_pickle(self, path, compression=compression, protocol=protocol)
2727 def to_clipboard(
2728 self, excel: bool_t = True, sep: Optional[str] = None, **kwargs
2729 ) -> None:
2730 r"""
2731 Copy object to the system clipboard.
2733 Write a text representation of object to the system clipboard.
2734 This can be pasted into Excel, for example.
2736 Parameters
2737 ----------
2738 excel : bool, default True
2739 Produce output in a csv format for easy pasting into excel.
2741 - True, use the provided separator for csv pasting.
2742 - False, write a string representation of the object to the clipboard.
2744 sep : str, default ``'\t'``
2745 Field delimiter.
2746 **kwargs
2747 These parameters will be passed to DataFrame.to_csv.
2749 See Also
2750 --------
2751 DataFrame.to_csv : Write a DataFrame to a comma-separated values
2752 (csv) file.
2753 read_clipboard : Read text from clipboard and pass to read_table.
2755 Notes
2756 -----
2757 Requirements for your platform:
2759 - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
2760 - Windows : none
2761 - OS X : none
2763 Examples
2764 --------
2765 Copy the contents of a DataFrame to the clipboard.
2767 >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
2768 >>> df.to_clipboard(sep=',')
2769 ... # Wrote the following to the system clipboard:
2770 ... # ,A,B,C
2771 ... # 0,1,2,3
2772 ... # 1,4,5,6
2774 We can omit the index by passing the keyword `index` and setting
2775 it to False.
2777 >>> df.to_clipboard(sep=',', index=False)
2778 ... # Wrote the following to the system clipboard:
2779 ... # A,B,C
2780 ... # 1,2,3
2781 ... # 4,5,6
2782 """
2783 from pandas.io import clipboards
2785 clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs)
2787 def to_xarray(self):
2788 """
2789 Return an xarray object from the pandas object.
2791 Returns
2792 -------
2793 xarray.DataArray or xarray.Dataset
2794 Data in the pandas structure converted to Dataset if the object is
2795 a DataFrame, or a DataArray if the object is a Series.
2797 See Also
2798 --------
2799 DataFrame.to_hdf : Write DataFrame to an HDF5 file.
2800 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
2802 Notes
2803 -----
2804 See the `xarray docs <http://xarray.pydata.org/en/stable/>`__
2806 Examples
2807 --------
2808 >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2),
2809 ... ('parrot', 'bird', 24.0, 2),
2810 ... ('lion', 'mammal', 80.5, 4),
2811 ... ('monkey', 'mammal', np.nan, 4)],
2812 ... columns=['name', 'class', 'max_speed',
2813 ... 'num_legs'])
2814 >>> df
2815 name class max_speed num_legs
2816 0 falcon bird 389.0 2
2817 1 parrot bird 24.0 2
2818 2 lion mammal 80.5 4
2819 3 monkey mammal NaN 4
2821 >>> df.to_xarray()
2822 <xarray.Dataset>
2823 Dimensions: (index: 4)
2824 Coordinates:
2825 * index (index) int64 0 1 2 3
2826 Data variables:
2827 name (index) object 'falcon' 'parrot' 'lion' 'monkey'
2828 class (index) object 'bird' 'bird' 'mammal' 'mammal'
2829 max_speed (index) float64 389.0 24.0 80.5 nan
2830 num_legs (index) int64 2 2 4 4
2832 >>> df['max_speed'].to_xarray()
2833 <xarray.DataArray 'max_speed' (index: 4)>
2834 array([389. , 24. , 80.5, nan])
2835 Coordinates:
2836 * index (index) int64 0 1 2 3
2838 >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01',
2839 ... '2018-01-02', '2018-01-02'])
2840 >>> df_multiindex = pd.DataFrame({'date': dates,
2841 ... 'animal': ['falcon', 'parrot',
2842 ... 'falcon', 'parrot'],
2843 ... 'speed': [350, 18, 361, 15]})
2844 >>> df_multiindex = df_multiindex.set_index(['date', 'animal'])
2846 >>> df_multiindex
2847 speed
2848 date animal
2849 2018-01-01 falcon 350
2850 parrot 18
2851 2018-01-02 falcon 361
2852 parrot 15
2854 >>> df_multiindex.to_xarray()
2855 <xarray.Dataset>
2856 Dimensions: (animal: 2, date: 2)
2857 Coordinates:
2858 * date (date) datetime64[ns] 2018-01-01 2018-01-02
2859 * animal (animal) object 'falcon' 'parrot'
2860 Data variables:
2861 speed (date, animal) int64 350 18 361 15
2862 """
2863 xarray = import_optional_dependency("xarray")
2865 if self.ndim == 1:
2866 return xarray.DataArray.from_series(self)
2867 else:
2868 return xarray.Dataset.from_dataframe(self)
2870 @Substitution(returns=fmt.return_docstring)
2871 def to_latex(
2872 self,
2873 buf=None,
2874 columns=None,
2875 col_space=None,
2876 header=True,
2877 index=True,
2878 na_rep="NaN",
2879 formatters=None,
2880 float_format=None,
2881 sparsify=None,
2882 index_names=True,
2883 bold_rows=False,
2884 column_format=None,
2885 longtable=None,
2886 escape=None,
2887 encoding=None,
2888 decimal=".",
2889 multicolumn=None,
2890 multicolumn_format=None,
2891 multirow=None,
2892 caption=None,
2893 label=None,
2894 ):
2895 r"""
2896 Render object to a LaTeX tabular, longtable, or nested table/tabular.
2898 Requires ``\usepackage{booktabs}``. The output can be copy/pasted
2899 into a main LaTeX document or read from an external file
2900 with ``\input{table.tex}``.
2902 .. versionchanged:: 0.20.2
2903 Added to Series.
2905 .. versionchanged:: 1.0.0
2906 Added caption and label arguments.
2908 Parameters
2909 ----------
2910 buf : str, Path or StringIO-like, optional, default None
2911 Buffer to write to. If None, the output is returned as a string.
2912 columns : list of label, optional
2913 The subset of columns to write. Writes all columns by default.
2914 col_space : int, optional
2915 The minimum width of each column.
2916 header : bool or list of str, default True
2917 Write out the column names. If a list of strings is given,
2918 it is assumed to be aliases for the column names.
2919 index : bool, default True
2920 Write row names (index).
2921 na_rep : str, default 'NaN'
2922 Missing data representation.
2923 formatters : list of functions or dict of {str: function}, optional
2924 Formatter functions to apply to columns' elements by position or
2925 name. The result of each function must be a unicode string.
2926 List must be of length equal to the number of columns.
2927 float_format : one-parameter function or str, optional, default None
2928 Formatter for floating point numbers. For example
2929 ``float_format="%%.2f"`` and ``float_format="{:0.2f}".format`` will
2930 both result in 0.1234 being formatted as 0.12.
2931 sparsify : bool, optional
2932 Set to False for a DataFrame with a hierarchical index to print
2933 every multiindex key at each row. By default, the value will be
2934 read from the config module.
2935 index_names : bool, default True
2936 Prints the names of the indexes.
2937 bold_rows : bool, default False
2938 Make the row labels bold in the output.
2939 column_format : str, optional
2940 The columns format as specified in `LaTeX table format
2941 <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
2942 columns. By default, 'l' will be used for all columns except
2943 columns of numbers, which default to 'r'.
2944 longtable : bool, optional
2945 By default, the value will be read from the pandas config
2946 module. Use a longtable environment instead of tabular. Requires
2947 adding a \usepackage{longtable} to your LaTeX preamble.
2948 escape : bool, optional
2949 By default, the value will be read from the pandas config
2950 module. When set to False, prevents escaping of latex special
2951 characters in column names.
2952 encoding : str, optional
2953 A string representing the encoding to use in the output file,
2954 defaults to 'utf-8'.
2955 decimal : str, default '.'
2956 Character recognized as decimal separator, e.g. ',' in Europe.
2957 multicolumn : bool, default True
2958 Use \multicolumn to enhance MultiIndex columns.
2959 The default will be read from the config module.
2960 multicolumn_format : str, default 'l'
2961 The alignment for multicolumns, similar to `column_format`.
2962 The default will be read from the config module.
2963 multirow : bool, default False
2964 Use \multirow to enhance MultiIndex rows. Requires adding a
2965 \usepackage{multirow} to your LaTeX preamble. Will print
2966 centered labels (instead of top-aligned) across the contained
2967 rows, separating groups via clines. The default will be read
2968 from the pandas config module.
2969 caption : str, optional
2970 The LaTeX caption to be placed inside ``\caption{}`` in the output.
2972 .. versionadded:: 1.0.0
2974 label : str, optional
2975 The LaTeX label to be placed inside ``\label{}`` in the output.
2976 This is used with ``\ref{}`` in the main ``.tex`` file.
2978 .. versionadded:: 1.0.0
2979 %(returns)s
2980 See Also
2981 --------
2982 DataFrame.to_string : Render a DataFrame to a console-friendly
2983 tabular output.
2984 DataFrame.to_html : Render a DataFrame as an HTML table.
2986 Examples
2987 --------
2988 >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
2989 ... 'mask': ['red', 'purple'],
2990 ... 'weapon': ['sai', 'bo staff']})
2991 >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE
2992 \begin{tabular}{lll}
2993 \toprule
2994 name & mask & weapon \\
2995 \midrule
2996 Raphael & red & sai \\
2997 Donatello & purple & bo staff \\
2998 \bottomrule
2999 \end{tabular}
3000 """
3001 # Get defaults from the pandas config
3002 if self.ndim == 1:
3003 self = self.to_frame()
3004 if longtable is None:
3005 longtable = config.get_option("display.latex.longtable")
3006 if escape is None:
3007 escape = config.get_option("display.latex.escape")
3008 if multicolumn is None:
3009 multicolumn = config.get_option("display.latex.multicolumn")
3010 if multicolumn_format is None:
3011 multicolumn_format = config.get_option("display.latex.multicolumn_format")
3012 if multirow is None:
3013 multirow = config.get_option("display.latex.multirow")
3015 formatter = DataFrameFormatter(
3016 self,
3017 columns=columns,
3018 col_space=col_space,
3019 na_rep=na_rep,
3020 header=header,
3021 index=index,
3022 formatters=formatters,
3023 float_format=float_format,
3024 bold_rows=bold_rows,
3025 sparsify=sparsify,
3026 index_names=index_names,
3027 escape=escape,
3028 decimal=decimal,
3029 )
3030 return formatter.to_latex(
3031 buf=buf,
3032 column_format=column_format,
3033 longtable=longtable,
3034 encoding=encoding,
3035 multicolumn=multicolumn,
3036 multicolumn_format=multicolumn_format,
3037 multirow=multirow,
3038 caption=caption,
3039 label=label,
3040 )
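# The caption/label arguments (new in 1.0.0) have no example in the
# docstring above; a hedged sketch of the expected shape of the output
# (the exact wrapper environment emitted is an assumption, hence the skip):
#
#   >>> print(df.to_latex(caption="Turtles", label="tab:turtles"))  # doctest: +SKIP
#   \begin{table}
#   \centering
#   \caption{Turtles}
#   \label{tab:turtles}
#   \begin{tabular}{llll}
#   ...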
3042 def to_csv(
3043 self,
3044 path_or_buf: Optional[FilePathOrBuffer] = None,
3045 sep: str = ",",
3046 na_rep: str = "",
3047 float_format: Optional[str] = None,
3048 columns: Optional[Sequence[Optional[Hashable]]] = None,
3049 header: Union[bool_t, List[str]] = True,
3050 index: bool_t = True,
3051 index_label: Optional[Union[bool_t, str, Sequence[Optional[Hashable]]]] = None,
3052 mode: str = "w",
3053 encoding: Optional[str] = None,
3054 compression: Optional[Union[str, Mapping[str, str]]] = "infer",
3055 quoting: Optional[int] = None,
3056 quotechar: str = '"',
3057 line_terminator: Optional[str] = None,
3058 chunksize: Optional[int] = None,
3059 date_format: Optional[str] = None,
3060 doublequote: bool_t = True,
3061 escapechar: Optional[str] = None,
3062 decimal: Optional[str] = ".",
3063 ) -> Optional[str]:
3064 r"""
3065 Write object to a comma-separated values (csv) file.
3067 .. versionchanged:: 0.24.0
3068 The order of arguments for Series was changed.
3070 Parameters
3071 ----------
3072 path_or_buf : str or file handle, default None
3073 File path or object, if None is provided the result is returned as
3074 a string. If a file object is passed it should be opened with
3075 `newline=''`, disabling universal newlines.
3077 .. versionchanged:: 0.24.0
3079 Was previously named "path" for Series.
3081 sep : str, default ','
3082 String of length 1. Field delimiter for the output file.
3083 na_rep : str, default ''
3084 Missing data representation.
3085 float_format : str, default None
3086 Format string for floating point numbers.
3087 columns : sequence, optional
3088 Columns to write.
3089 header : bool or list of str, default True
3090 Write out the column names. If a list of strings is given it is
3091 assumed to be aliases for the column names.
3093 .. versionchanged:: 0.24.0
3095 Previously defaulted to False for Series.
3097 index : bool, default True
3098 Write row names (index).
3099 index_label : str or sequence, or False, default None
3100 Column label for index column(s) if desired. If None is given, and
3101 `header` and `index` are True, then the index names are used. A
3102 sequence should be given if the object uses MultiIndex. If
3103 False do not print fields for index names. Use index_label=False
3104 for easier importing in R.
3105 mode : str
3106 Python write mode, default 'w'.
3107 encoding : str, optional
3108 A string representing the encoding to use in the output file,
3109 defaults to 'utf-8'.
3110 compression : str or dict, default 'infer'
3111 If str, represents compression mode. If dict, value at 'method' is
3112 the compression mode. Compression mode may be any of the following
3113 possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If
3114 compression mode is 'infer' and `path_or_buf` is path-like, then
3115 detect compression mode from the following extensions: '.gz',
3116 '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
3117 and mode is 'zip' or inferred as 'zip', other entries passed as
3118 additional compression options.
3120 .. versionchanged:: 1.0.0
3122 May now be a dict with key 'method' as compression mode
3123 and other entries as additional compression options if
3124 compression mode is 'zip'.
3126 quoting : optional constant from csv module
3127 Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
3128 then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
3129 will treat them as non-numeric.
3130 quotechar : str, default '\"'
3131 String of length 1. Character used to quote fields.
3132 line_terminator : str, optional
3133 The newline character or character sequence to use in the output
3134 file. Defaults to `os.linesep`, which depends on the OS in which
3135 this method is called (e.g. '\n' for Linux, '\r\n' for Windows).
3137 .. versionchanged:: 0.24.0
3138 chunksize : int or None
3139 Rows to write at a time.
3140 date_format : str, default None
3141 Format string for datetime objects.
3142 doublequote : bool, default True
3143 Control quoting of `quotechar` inside a field.
3144 escapechar : str, default None
3145 String of length 1. Character used to escape `sep` and `quotechar`
3146 when appropriate.
3147 decimal : str, default '.'
3148 Character recognized as decimal separator. E.g. use ',' for
3149 European data.
3151 Returns
3152 -------
3153 None or str
3154 If path_or_buf is None, returns the resulting csv format as a
3155 string. Otherwise returns None.
3157 See Also
3158 --------
3159 read_csv : Load a CSV file into a DataFrame.
3160 to_excel : Write DataFrame to an Excel file.
3162 Examples
3163 --------
3164 >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
3165 ... 'mask': ['red', 'purple'],
3166 ... 'weapon': ['sai', 'bo staff']})
3167 >>> df.to_csv(index=False)
3168 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
3170 Create 'out.zip' containing 'out.csv'
3172 >>> compression_opts = dict(method='zip',
3173 ... archive_name='out.csv') # doctest: +SKIP
3174 >>> df.to_csv('out.zip', index=False,
3175 ... compression=compression_opts) # doctest: +SKIP
3176 """
3178 df = self if isinstance(self, ABCDataFrame) else self.to_frame()
3180 from pandas.io.formats.csvs import CSVFormatter
3182 formatter = CSVFormatter(
3183 df,
3184 path_or_buf,
3185 line_terminator=line_terminator,
3186 sep=sep,
3187 encoding=encoding,
3188 compression=compression,
3189 quoting=quoting,
3190 na_rep=na_rep,
3191 float_format=float_format,
3192 cols=columns,
3193 header=header,
3194 index=index,
3195 index_label=index_label,
3196 mode=mode,
3197 chunksize=chunksize,
3198 quotechar=quotechar,
3199 date_format=date_format,
3200 doublequote=doublequote,
3201 escapechar=escapechar,
3202 decimal=decimal,
3203 )
3204 formatter.save()
3206 if path_or_buf is None:
3207 return formatter.path_or_buf.getvalue()
3209 return None
3211 # ----------------------------------------------------------------------
3212 # Fancy Indexing
3214 @classmethod
3215 def _create_indexer(cls, name: str, indexer) -> None:
3216 """Create an indexer like _name in the class.
3218 Kept for compatibility with geopandas. To be removed in the future. See GH27258
3219 """
3220 if getattr(cls, name, None) is None:
3221 _indexer = functools.partial(indexer, name)
3222 setattr(cls, name, property(_indexer, doc=indexer.__doc__))
3224 # ----------------------------------------------------------------------
3225 # Lookup Caching
3227 def _set_as_cached(self, item, cacher) -> None:
3228 """Set the _cacher attribute on the calling object with a weakref to
3229 cacher.
3230 """
3231 self._cacher = (item, weakref.ref(cacher))
3233 def _reset_cacher(self) -> None:
3234 """Reset the cacher."""
3235 if hasattr(self, "_cacher"):
3236 del self._cacher
3238 def _maybe_cache_changed(self, item, value) -> None:
3239 """The object has called back to us saying maybe it has changed.
3240 """
3241 self._data.set(item, value)
3243 @property
3244 def _is_cached(self) -> bool_t:
3245 """Return boolean indicating if self is cached or not."""
3246 return getattr(self, "_cacher", None) is not None
3248 def _get_cacher(self):
3249 """return my cacher or None"""
3250 cacher = getattr(self, "_cacher", None)
3251 if cacher is not None:
3252 cacher = cacher[1]()
3253 return cacher
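# The cacher is stored as ``(item, weakref.ref(parent))``; calling the
# stored weakref dereferences it and yields None once the parent has been
# collected. A minimal standalone sketch of that pattern (CPython's
# refcounting collects the parent immediately on `del`):
#
#   import weakref
#
#   class Parent:
#       pass
#
#   p = Parent()
#   ref = weakref.ref(p)
#   assert ref() is p       # parent alive: dereference succeeds
#   del p                   # drop the last strong reference
#   assert ref() is None    # dead referent: dereference yields None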
3255 def _maybe_update_cacher(
3256 self, clear: bool_t = False, verify_is_copy: bool_t = True
3257 ) -> None:
3258 """
3259 See if we need to update our parent cacher; if `clear`, then clear
3260 our cache.
3262 Parameters
3263 ----------
3264 clear : bool, default False
3265 Clear the item cache.
3266 verify_is_copy : bool, default True
3267 Provide is_copy checks.
3268 """
3270 cacher = getattr(self, "_cacher", None)
3271 if cacher is not None:
3272 ref = cacher[1]()
3274 # we are trying to reference a dead referent, hence
3275 # a copy
3276 if ref is None:
3277 del self._cacher
3278 else:
3279 # Note: we need to call ref._maybe_cache_changed even in the
3280 # case where it will raise. (The reason is not fully clear.)
3281 try:
3282 ref._maybe_cache_changed(cacher[0], self)
3283 except AssertionError:
3284 # ref._data.setitem can raise
3285 # AssertionError because of shape mismatch
3286 pass
3288 if verify_is_copy:
3289 self._check_setitem_copy(stacklevel=5, t="referant")
3291 if clear:
3292 self._clear_item_cache()
3294 def _clear_item_cache(self) -> None:
3295 self._item_cache.clear()
3297 # ----------------------------------------------------------------------
3298 # Indexing Methods
3300 def take(
3301 self: FrameOrSeries, indices, axis=0, is_copy: Optional[bool_t] = None, **kwargs
3302 ) -> FrameOrSeries:
3303 """
3304 Return the elements in the given *positional* indices along an axis.
3306 This means that we are not indexing according to actual values in
3307 the index attribute of the object. We are indexing according to the
3308 actual position of the element in the object.
3310 Parameters
3311 ----------
3312 indices : array-like
3313 An array of ints indicating which positions to take.
3314 axis : {0 or 'index', 1 or 'columns', None}, default 0
3315 The axis on which to select elements. ``0`` means that we are
3316 selecting rows, ``1`` means that we are selecting columns.
3317 is_copy : bool
3318 Before pandas 1.0, ``is_copy=False`` can be specified to ensure
3319 that the return value is an actual copy. Starting with pandas 1.0,
3320 ``take`` always returns a copy, and the keyword is therefore
3321 deprecated.
3323 .. deprecated:: 1.0.0
3324 **kwargs
3325 For compatibility with :meth:`numpy.take`. Has no effect on the
3326 output.
3328 Returns
3329 -------
3330 taken : same type as caller
3331 An array-like containing the elements taken from the object.
3333 See Also
3334 --------
3335 DataFrame.loc : Select a subset of a DataFrame by labels.
3336 DataFrame.iloc : Select a subset of a DataFrame by positions.
3337 numpy.take : Take elements from an array along an axis.
3339 Examples
3340 --------
3341 >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
3342 ... ('parrot', 'bird', 24.0),
3343 ... ('lion', 'mammal', 80.5),
3344 ... ('monkey', 'mammal', np.nan)],
3345 ... columns=['name', 'class', 'max_speed'],
3346 ... index=[0, 2, 3, 1])
3347 >>> df
3348 name class max_speed
3349 0 falcon bird 389.0
3350 2 parrot bird 24.0
3351 3 lion mammal 80.5
3352 1 monkey mammal NaN
3354 Take elements at positions 0 and 3 along the axis 0 (default).
3356 Note how the actual indices selected (0 and 1) do not correspond to
3357 our selected indices 0 and 3. That's because we are selecting the 0th
3358 and 3rd rows, not rows whose indices equal 0 and 3.
3360 >>> df.take([0, 3])
3361 name class max_speed
3362 0 falcon bird 389.0
3363 1 monkey mammal NaN
3365 Take elements at indices 1 and 2 along the axis 1 (column selection).
3367 >>> df.take([1, 2], axis=1)
3368 class max_speed
3369 0 bird 389.0
3370 2 bird 24.0
3371 3 mammal 80.5
3372 1 mammal NaN
3374 We may take elements using negative integers for positive indices,
3375 starting from the end of the object, just like with Python lists.
3377 >>> df.take([-1, -2])
3378 name class max_speed
3379 1 monkey mammal NaN
3380 3 lion mammal 80.5
3381 """
3382 if is_copy is not None:
3383 warnings.warn(
3384 "is_copy is deprecated and will be removed in a future version. "
3385 "'take' always returns a copy, so there is no need to specify this.",
3386 FutureWarning,
3387 stacklevel=2,
3388 )
3390 nv.validate_take(tuple(), kwargs)
3392 self._consolidate_inplace()
3394 new_data = self._data.take(
3395 indices, axis=self._get_block_manager_axis(axis), verify=True
3396 )
3397 return self._constructor(new_data).__finalize__(self)
3399 def _take_with_is_copy(
3400 self: FrameOrSeries, indices, axis=0, **kwargs
3401 ) -> FrameOrSeries:
3402 """
3403 Internal version of the `take` method that sets the `_is_copy`
3404 attribute to keep track of the parent dataframe (using in indexing
3405 for the SettingWithCopyWarning).
3407 See the docstring of `take` for full explanation of the parameters.
3408 """
3409 result = self.take(indices=indices, axis=axis, **kwargs)
3410 # Maybe set copy if we didn't actually change the index.
3411 if not result._get_axis(axis).equals(self._get_axis(axis)):
3412 result._set_is_copy(self)
3413 return result
3415 def xs(self, key, axis=0, level=None, drop_level: bool_t = True):
3416 """
3417 Return cross-section from the Series/DataFrame.
3419 This method takes a `key` argument to select data at a particular
3420 level of a MultiIndex.
3422 Parameters
3423 ----------
3424 key : label or tuple of label
3425 Label contained in the index, or partially in a MultiIndex.
3426 axis : {0 or 'index', 1 or 'columns'}, default 0
3427 Axis to retrieve cross-section on.
3428 level : object, defaults to first n levels (n=1 or len(key))
3429 In case of a key partially contained in a MultiIndex, indicate
3430 which levels are used. Levels can be referred to by label or position.
3431 drop_level : bool, default True
3432 If False, returns object with same levels as self.
3434 Returns
3435 -------
3436 Series or DataFrame
3437 Cross-section from the original Series or DataFrame
3438 corresponding to the selected index levels.
3440 See Also
3441 --------
3442 DataFrame.loc : Access a group of rows and columns
3443 by label(s) or a boolean array.
3444 DataFrame.iloc : Purely integer-location based indexing
3445 for selection by position.
3447 Notes
3448 -----
3449 `xs` can not be used to set values.
3451 MultiIndex Slicers is a generic way to get/set values on
3452 any level or levels.
3453 It is a superset of `xs` functionality, see
3454 :ref:`MultiIndex Slicers <advanced.mi_slicers>`.
3456 Examples
3457 --------
3458 >>> d = {'num_legs': [4, 4, 2, 2],
3459 ... 'num_wings': [0, 0, 2, 2],
3460 ... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
3461 ... 'animal': ['cat', 'dog', 'bat', 'penguin'],
3462 ... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
3463 >>> df = pd.DataFrame(data=d)
3464 >>> df = df.set_index(['class', 'animal', 'locomotion'])
3465 >>> df
3466 num_legs num_wings
3467 class animal locomotion
3468 mammal cat walks 4 0
3469 dog walks 4 0
3470 bat flies 2 2
3471 bird penguin walks 2 2
3473 Get values at specified index
3475 >>> df.xs('mammal')
3476 num_legs num_wings
3477 animal locomotion
3478 cat walks 4 0
3479 dog walks 4 0
3480 bat flies 2 2
3482 Get values at several indexes
3484 >>> df.xs(('mammal', 'dog'))
3485 num_legs num_wings
3486 locomotion
3487 walks 4 0
3489 Get values at specified index and level
3491 >>> df.xs('cat', level=1)
3492 num_legs num_wings
3493 class locomotion
3494 mammal walks 4 0
3496 Get values at several indexes and levels
3498 >>> df.xs(('bird', 'walks'),
3499 ... level=[0, 'locomotion'])
3500 num_legs num_wings
3501 animal
3502 penguin 2 2
3504 Get values at specified column and axis
3506 >>> df.xs('num_wings', axis=1)
3507 class animal locomotion
3508 mammal cat walks 0
3509 dog walks 0
3510 bat flies 2
3511 bird penguin walks 2
3512 Name: num_wings, dtype: int64
3513 """
3514 axis = self._get_axis_number(axis)
3515 labels = self._get_axis(axis)
3516 if level is not None:
3517 loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
3519 # create the tuple of the indexer
3520 _indexer = [slice(None)] * self.ndim
3521 _indexer[axis] = loc
3522 indexer = tuple(_indexer)
3524 result = self.iloc[indexer]
3525 setattr(result, result._get_axis_name(axis), new_ax)
3526 return result
3528 if axis == 1:
3529 return self[key]
3531 self._consolidate_inplace()
3533 index = self.index
3534 if isinstance(index, MultiIndex):
3535 loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
3536 else:
3537 loc = self.index.get_loc(key)
3539 if isinstance(loc, np.ndarray):
3540 if loc.dtype == np.bool_:
3541 (inds,) = loc.nonzero()
3542 return self._take_with_is_copy(inds, axis=axis)
3543 else:
3544 return self._take_with_is_copy(loc, axis=axis)
3546 if not is_scalar(loc):
3547 new_index = self.index[loc]
3549 if is_scalar(loc):
3550 new_values = self._data.fast_xs(loc)
3552 # may need to box a datelike-scalar
3553 #
3554 # if we encounter an array-like and we only have 1 dim
3555 # that means that there are lists/ndarrays inside the Series!
3556 # so just return them (GH 6394)
3557 if not is_list_like(new_values) or self.ndim == 1:
3558 return com.maybe_box_datetimelike(new_values)
3560 result = self._constructor_sliced(
3561 new_values,
3562 index=self.columns,
3563 name=self.index[loc],
3564 dtype=new_values.dtype,
3565 )
3567 else:
3568 result = self.iloc[loc]
3569 result.index = new_index
3571 # this could be a view
3572 # but only in a single-dtyped view sliceable case
3573 result._set_is_copy(self, copy=not result._is_view)
3574 return result
3576 _xs: Callable = xs
3578 def __getitem__(self, item):
3579 raise AbstractMethodError(self)
3581 def _get_item_cache(self, item):
3582 """Return the cached item, item represents a label indexer."""
3583 cache = self._item_cache
3584 res = cache.get(item)
3585 if res is None:
3586 values = self._data.get(item)
3587 res = self._box_item_values(item, values)
3588 cache[item] = res
3589 res._set_as_cached(item, self)
3591 # for a chain
3592 res._is_copy = self._is_copy
3593 return res
3595 def _iget_item_cache(self, item):
3596 """Return the cached item, item represents a positional indexer."""
3597 ax = self._info_axis
3598 if ax.is_unique:
3599 lower = self._get_item_cache(ax[item])
3600 else:
3601 lower = self._take_with_is_copy(item, axis=self._info_axis_number)
3602 return lower
3604 def _box_item_values(self, key, values):
3605 raise AbstractMethodError(self)
3607 def _slice(self: FrameOrSeries, slobj: slice, axis=0, kind=None) -> FrameOrSeries:
3608 """
3609 Construct a slice of this container.
3611 kind parameter is maintained for compatibility with Series slicing.
3612 """
3613 axis = self._get_block_manager_axis(axis)
3614 result = self._constructor(self._data.get_slice(slobj, axis=axis))
3615 result = result.__finalize__(self)
3617 # this could be a view
3618 # but only in a single-dtyped view sliceable case
3619 is_copy = axis != 0 or result._is_view
3620 result._set_is_copy(self, copy=is_copy)
3621 return result
3623 def _set_item(self, key, value) -> None:
3624 self._data.set(key, value)
3625 self._clear_item_cache()
3627 def _set_is_copy(self, ref=None, copy: bool_t = True) -> None:
3628 if not copy:
3629 self._is_copy = None
3630 else:
3631 if ref is not None:
3632 self._is_copy = weakref.ref(ref)
3633 else:
3634 self._is_copy = None
3636 def _check_is_chained_assignment_possible(self) -> bool_t:
3637 """
3638 Check if we are a view, have a cacher, and are of mixed type.
3639 If so, then force a setitem_copy check.
3641 Should be called just before setting a value.
3643 Will return True if we are a view and are cached, but of a
3644 single dtype, meaning that the cacher should be updated following
3645 the setting.
3646 """
3647 if self._is_view and self._is_cached:
3648 ref = self._get_cacher()
3649 if ref is not None and ref._is_mixed_type:
3650 self._check_setitem_copy(stacklevel=4, t="referant", force=True)
3651 return True
3652 elif self._is_copy:
3653 self._check_setitem_copy(stacklevel=4, t="referant")
3654 return False
3656 def _check_setitem_copy(self, stacklevel=4, t="setting", force=False):
3657 """
3659 Parameters
3660 ----------
3661 stacklevel : int, default 4
3662 The level of the stack to show when the error is output.
3663 t : str, the type of setting error
3664 force : bool, default False
3665 If True, then force showing an error.
3667 Validate if we are doing a setitem on a chained copy.
3669 If you call this function, be sure to set the stacklevel such that the
3670 user will see the error *at the level of setting*.
3672 It is technically possible to figure out that we are setting on
3673 a copy even WITH a multi-dtyped pandas object. In other words, some
3674 blocks may be views while others are not. Currently _is_view will ALWAYS
3675 return False for multi-blocks to avoid having to handle this case.
3677 df = DataFrame(np.arange(0,9), columns=['count'])
3678 df['group'] = 'b'
3680 # This technically need not raise SettingWithCopy if both are views
3681 # (which is not generally guaranteed but is usually True). However,
3682 # this is in general not a good practice and we recommend using .loc.
3683 df.iloc[0:5]['group'] = 'a'
3685 """
3687 # return early if the check is not needed
3688 if not (force or self._is_copy):
3689 return
3691 value = config.get_option("mode.chained_assignment")
3692 if value is None:
3693 return
3695 # see if the copy is not actually referred; if so, then dissolve
3696 # the copy weakref
3697 if self._is_copy is not None and not isinstance(self._is_copy, str):
3698 r = self._is_copy()
3699 if not gc.get_referents(r) or r.shape == self.shape:
3700 self._is_copy = None
3701 return
3703 # a custom message
3704 if isinstance(self._is_copy, str):
3705 t = self._is_copy
3707 elif t == "referant":
3708 t = (
3709 "\n"
3710 "A value is trying to be set on a copy of a slice from a "
3711 "DataFrame\n\n"
3712 "See the caveats in the documentation: "
3713 "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
3714 "indexing.html#returning-a-view-versus-a-copy"
3715 )
3717 else:
3718 t = (
3719 "\n"
3720 "A value is trying to be set on a copy of a slice from a "
3721 "DataFrame.\n"
3722 "Try using .loc[row_indexer,col_indexer] = value "
3723 "instead\n\nSee the caveats in the documentation: "
3724 "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
3725 "indexing.html#returning-a-view-versus-a-copy"
3726 )
3728 if value == "raise":
3729 raise com.SettingWithCopyError(t)
3730 elif value == "warn":
3731 warnings.warn(t, com.SettingWithCopyWarning, stacklevel=stacklevel)
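# A hedged sketch of triggering the check above via the chained assignment
# from the docstring; whether it raises or warns depends on the
# ``mode.chained_assignment`` option read at the top of the method:
#
#   >>> pd.set_option("mode.chained_assignment", "warn")
#   >>> df = pd.DataFrame(np.arange(0, 9), columns=["count"])
#   >>> df["group"] = "b"
#   >>> df.iloc[0:5]["group"] = "a"  # doctest: +SKIP
#   SettingWithCopyWarning:
#   A value is trying to be set on a copy of a slice from a DataFrame...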
3733 def __delitem__(self, key) -> None:
3734 """
3735 Delete item
3736 """
3737 deleted = False
3739 maybe_shortcut = False
3740 if self.ndim == 2 and isinstance(self.columns, MultiIndex):
3741 try:
3742 maybe_shortcut = key not in self.columns._engine
3743 except TypeError:
3744 pass
3746 if maybe_shortcut:
3747 # Allow shorthand to delete all columns whose first len(key)
3748 # elements match key:
3749 if not isinstance(key, tuple):
3750 key = (key,)
3751 for col in self.columns:
3752 if isinstance(col, tuple) and col[: len(key)] == key:
3753 del self[col]
3754 deleted = True
3755 if not deleted:
3756 # If the above loop ran and didn't delete anything because
3757 # there was no match, this call should raise the appropriate
3758 # exception:
3759 self._data.delete(key)
3761 # delete from the caches
3762 try:
3763 del self._item_cache[key]
3764 except KeyError:
3765 pass
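# A hedged sketch of the MultiIndex shorthand implemented above: deleting a
# key that matches only the leading level(s) removes every matching column:
#
#   >>> cols = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "x")])
#   >>> df = pd.DataFrame([[1, 2, 3]], columns=cols)
#   >>> del df["a"]            # removes ("a", "x") and ("a", "y")
#   >>> list(df.columns)
#   [('b', 'x')]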
3767 # ----------------------------------------------------------------------
3768 # Unsorted
3770 def get(self, key, default=None):
3771 """
3772 Get item from object for given key (ex: DataFrame column).
3774 Returns default value if not found.
3776 Parameters
3777 ----------
3778 key : object
3780 Returns
3781 -------
3782 value : same type as items contained in object
3783 """
3784 try:
3785 return self[key]
3786 except (KeyError, ValueError, IndexError):
3787 return default
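# The docstring above has no example; a minimal sketch of the fallback
# behavior:
#
#   >>> df = pd.DataFrame({"A": [1, 2]})
#   >>> df.get("A")
#   0    1
#   1    2
#   Name: A, dtype: int64
#   >>> df.get("Z", default="missing")
#   'missing'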
3789 @property
3790 def _is_view(self):
3791 """Return boolean indicating if self is view of another array """
3792 return self._data.is_view
3794 def reindex_like(
3795 self: FrameOrSeries,
3796 other,
3797 method: Optional[str] = None,
3798 copy: bool_t = True,
3799 limit=None,
3800 tolerance=None,
3801 ) -> FrameOrSeries:
3802 """
3803 Return an object with indices matching those of the other object.
3805 Conform the object to the same index on all axes. Optional
3806 filling logic, placing NaN in locations having no value
3807 in the previous index. A new object is produced unless the
3808 new index is equivalent to the current one and copy=False.
3810 Parameters
3811 ----------
3812 other : Object of the same data type
3813 Its row and column indices are used to define the new indices
3814 of this object.
3815 method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
3816 Method to use for filling holes in reindexed DataFrame.
3817 Please note: this is only applicable to DataFrames/Series with a
3818 monotonically increasing/decreasing index.
3820 * None (default): don't fill gaps
3821 * pad / ffill: propagate last valid observation forward to next
3822 valid
3823 * backfill / bfill: use next valid observation to fill gap
3824 * nearest: use nearest valid observations to fill gap.
3826 copy : bool, default True
3827 Return a new object, even if the passed indexes are the same.
3828 limit : int, default None
3829 Maximum number of consecutive labels to fill for inexact matches.
3830 tolerance : optional
3831 Maximum distance between original and new labels for inexact
3832 matches. The values of the index at the matching locations must
3833 satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
3835 Tolerance may be a scalar value, which applies the same tolerance
3836 to all values, or list-like, which applies variable tolerance per
3837 element. List-like includes list, tuple, array, and Series, and must
3838 be the same size as the index; its dtype must exactly match the
3839 index's type.
3841 .. versionadded:: 0.21.0 (list-like tolerance)
3843 Returns
3844 -------
3845 Series or DataFrame
3846 Same type as caller, but with changed indices on each axis.
3848 See Also
3849 --------
3850 DataFrame.set_index : Set row labels.
3851 DataFrame.reset_index : Remove row labels or move them to new columns.
3852 DataFrame.reindex : Change to new indices or expand indices.
3854 Notes
3855 -----
3856 Same as calling
3857 ``.reindex(index=other.index, columns=other.columns,...)``.
3859 Examples
3860 --------
3861 >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'],
3862 ... [31, 87.8, 'high'],
3863 ... [22, 71.6, 'medium'],
3864 ... [35, 95, 'medium']],
3865 ... columns=['temp_celsius', 'temp_fahrenheit',
3866 ... 'windspeed'],
3867 ... index=pd.date_range(start='2014-02-12',
3868 ... end='2014-02-15', freq='D'))
3870 >>> df1
3871 temp_celsius temp_fahrenheit windspeed
3872 2014-02-12 24.3 75.7 high
3873 2014-02-13 31.0 87.8 high
3874 2014-02-14 22.0 71.6 medium
3875 2014-02-15 35.0 95.0 medium
3877 >>> df2 = pd.DataFrame([[28, 'low'],
3878 ... [30, 'low'],
3879 ... [35.1, 'medium']],
3880 ... columns=['temp_celsius', 'windspeed'],
3881 ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
3882 ... '2014-02-15']))
3884 >>> df2
3885 temp_celsius windspeed
3886 2014-02-12 28.0 low
3887 2014-02-13 30.0 low
3888 2014-02-15 35.1 medium
3890 >>> df2.reindex_like(df1)
3891 temp_celsius temp_fahrenheit windspeed
3892 2014-02-12 28.0 NaN low
3893 2014-02-13 30.0 NaN low
3894 2014-02-14 NaN NaN NaN
3895 2014-02-15 35.1 NaN medium
3896 """
3897 d = other._construct_axes_dict(
3898 axes=self._AXIS_ORDERS,
3899 method=method,
3900 copy=copy,
3901 limit=limit,
3902 tolerance=tolerance,
3903 )
3905 return self.reindex(**d)
3907 def drop(
3908 self,
3909 labels=None,
3910 axis=0,
3911 index=None,
3912 columns=None,
3913 level=None,
3914 inplace: bool_t = False,
3915 errors: str = "raise",
3916 ):
3918 inplace = validate_bool_kwarg(inplace, "inplace")
3920 if labels is not None:
3921 if index is not None or columns is not None:
3922 raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
3923 axis_name = self._get_axis_name(axis)
3924 axes = {axis_name: labels}
3925 elif index is not None or columns is not None:
3926 axes, _ = self._construct_axes_from_arguments((index, columns), {})
3927 else:
3928 raise ValueError(
3929 "Need to specify at least one of 'labels', 'index' or 'columns'"
3930 )
3932 obj = self
3934 for axis, labels in axes.items():
3935 if labels is not None:
3936 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
3938 if inplace:
3939 self._update_inplace(obj)
3940 else:
3941 return obj
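# Illustrative usage sketch for ``drop`` (hypothetical ``df``): the
# 'labels' + 'axis' convention and the 'index'/'columns' convention
# handled above are equivalent.
#
# >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["x", "y"])
# >>> df.drop("a", axis=1).equals(df.drop(columns="a"))
# True
# >>> df.drop(index="x")
#    a  b
# y  2  4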
3943 def _drop_axis(
3944 self: FrameOrSeries, labels, axis, level=None, errors: str = "raise"
3945 ) -> FrameOrSeries:
3946 """
3947 Drop labels from specified axis. Used in the ``drop`` method
3948 internally.
3950 Parameters
3951 ----------
3952 labels : single label or list-like
3953 axis : int or axis name
3954 level : int or level name, default None
3955 For MultiIndex
3956 errors : {'ignore', 'raise'}, default 'raise'
3957 If 'ignore', suppress error and only existing labels are dropped.
3959 """
3960 axis = self._get_axis_number(axis)
3961 axis_name = self._get_axis_name(axis)
3962 axis = self._get_axis(axis)
3964 if axis.is_unique:
3965 if level is not None:
3966 if not isinstance(axis, MultiIndex):
3967 raise AssertionError("axis must be a MultiIndex")
3968 new_axis = axis.drop(labels, level=level, errors=errors)
3969 else:
3970 new_axis = axis.drop(labels, errors=errors)
3971 result = self.reindex(**{axis_name: new_axis})
3973 # Case for non-unique axis
3974 else:
3975 labels = ensure_object(com.index_labels_to_array(labels))
3976 if level is not None:
3977 if not isinstance(axis, MultiIndex):
3978 raise AssertionError("axis must be a MultiIndex")
3979 indexer = ~axis.get_level_values(level).isin(labels)
3981 # GH 18561 MultiIndex.drop should raise if label is absent
3982 if errors == "raise" and indexer.all():
3983 raise KeyError(f"{labels} not found in axis")
3984 else:
3985 indexer = ~axis.isin(labels)
3986 # Check if label doesn't exist along axis
3987 labels_missing = (axis.get_indexer_for(labels) == -1).any()
3988 if errors == "raise" and labels_missing:
3989 raise KeyError(f"{labels} not found in axis")
3991 slicer = [slice(None)] * self.ndim
3992 slicer[self._get_axis_number(axis_name)] = indexer
3994 result = self.loc[tuple(slicer)]
3996 return result
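# Sketch of the non-unique-axis branch above (hypothetical ``s``): with
# duplicate labels, the boolean ``indexer`` removes every match.
#
# >>> s = pd.Series([1, 2, 3], index=["a", "a", "b"])
# >>> s.drop("a")
# b    3
# dtype: int64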
3998 def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None:
3999 """
4000 Replace self internals with result.
4002 Parameters
4003 ----------
4004 verify_is_copy : bool, default True
4005 Provide is_copy checks.
4006 """
4007 # NOTE: This does *not* call __finalize__ and that's an explicit
4008 # decision that we may revisit in the future.
4010 self._reset_cache()
4011 self._clear_item_cache()
4012 self._data = getattr(result, "_data", result)
4013 self._maybe_update_cacher(verify_is_copy=verify_is_copy)
4015 def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries:
4016 """
4017 Prefix labels with string `prefix`.
4019 For Series, the row labels are prefixed.
4020 For DataFrame, the column labels are prefixed.
4022 Parameters
4023 ----------
4024 prefix : str
4025 The string to add before each label.
4027 Returns
4028 -------
4029 Series or DataFrame
4030 New Series or DataFrame with updated labels.
4032 See Also
4033 --------
4034 Series.add_suffix: Suffix row labels with string `suffix`.
4035 DataFrame.add_suffix: Suffix column labels with string `suffix`.
4037 Examples
4038 --------
4039 >>> s = pd.Series([1, 2, 3, 4])
4040 >>> s
4041 0 1
4042 1 2
4043 2 3
4044 3 4
4045 dtype: int64
4047 >>> s.add_prefix('item_')
4048 item_0 1
4049 item_1 2
4050 item_2 3
4051 item_3 4
4052 dtype: int64
4054 >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
4055 >>> df
4056 A B
4057 0 1 3
4058 1 2 4
4059 2 3 5
4060 3 4 6
4062 >>> df.add_prefix('col_')
4063 col_A col_B
4064 0 1 3
4065 1 2 4
4066 2 3 5
4067 3 4 6
4068 """
4069 f = functools.partial("{prefix}{}".format, prefix=prefix)
4071 mapper = {self._info_axis_name: f}
4072 return self.rename(**mapper) # type: ignore
4074 def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries:
4075 """
4076 Suffix labels with string `suffix`.
4078 For Series, the row labels are suffixed.
4079 For DataFrame, the column labels are suffixed.
4081 Parameters
4082 ----------
4083 suffix : str
4084 The string to add after each label.
4086 Returns
4087 -------
4088 Series or DataFrame
4089 New Series or DataFrame with updated labels.
4091 See Also
4092 --------
4093 Series.add_prefix: Prefix row labels with string `prefix`.
4094 DataFrame.add_prefix: Prefix column labels with string `prefix`.
4096 Examples
4097 --------
4098 >>> s = pd.Series([1, 2, 3, 4])
4099 >>> s
4100 0 1
4101 1 2
4102 2 3
4103 3 4
4104 dtype: int64
4106 >>> s.add_suffix('_item')
4107 0_item 1
4108 1_item 2
4109 2_item 3
4110 3_item 4
4111 dtype: int64
4113 >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
4114 >>> df
4115 A B
4116 0 1 3
4117 1 2 4
4118 2 3 5
4119 3 4 6
4121 >>> df.add_suffix('_col')
4122 A_col B_col
4123 0 1 3
4124 1 2 4
4125 2 3 5
4126 3 4 6
4127 """
4128 f = functools.partial("{}{suffix}".format, suffix=suffix)
4130 mapper = {self._info_axis_name: f}
4131 return self.rename(**mapper) # type: ignore
4133 def sort_values(
4134 self,
4135 by=None,
4136 axis=0,
4137 ascending=True,
4138 inplace: bool_t = False,
4139 kind: str = "quicksort",
4140 na_position: str = "last",
4141 ignore_index: bool_t = False,
4142 ):
4143 """
4144 Sort by the values along either axis.
4146 Parameters
4147 ----------%(optional_by)s
4148 axis : %(axes_single_arg)s, default 0
4149 Axis to be sorted.
4150 ascending : bool or list of bool, default True
4151 Sort ascending vs. descending. Specify list for multiple sort
4152 orders. If this is a list of bools, must match the length of
4153 the by.
4154 inplace : bool, default False
4155 If True, perform operation in-place.
4156 kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
4157 Choice of sorting algorithm. See also ``numpy.sort`` for more
4158 information. `mergesort` is the only stable algorithm. For
4159 DataFrames, this option is only applied when sorting on a single
4160 column or label.
4161 na_position : {'first', 'last'}, default 'last'
4162 Puts NaNs at the beginning if `first`; `last` puts NaNs at the
4163 end.
4164 ignore_index : bool, default False
4165 If True, the resulting axis will be labeled 0, 1, …, n - 1.
4167 .. versionadded:: 1.0.0
4169 Returns
4170 -------
4171 sorted_obj : DataFrame or None
4172 DataFrame with sorted values if inplace=False, None otherwise.
4174 Examples
4175 --------
4176 >>> df = pd.DataFrame({
4177 ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
4178 ... 'col2': [2, 1, 9, 8, 7, 4],
4179 ... 'col3': [0, 1, 9, 4, 2, 3],
4180 ... })
4181 >>> df
4182 col1 col2 col3
4183 0 A 2 0
4184 1 A 1 1
4185 2 B 9 9
4186 3 NaN 8 4
4187 4 D 7 2
4188 5 C 4 3
4190 Sort by col1
4192 >>> df.sort_values(by=['col1'])
4193 col1 col2 col3
4194 0 A 2 0
4195 1 A 1 1
4196 2 B 9 9
4197 5 C 4 3
4198 4 D 7 2
4199 3 NaN 8 4
4201 Sort by multiple columns
4203 >>> df.sort_values(by=['col1', 'col2'])
4204 col1 col2 col3
4205 1 A 1 1
4206 0 A 2 0
4207 2 B 9 9
4208 5 C 4 3
4209 4 D 7 2
4210 3 NaN 8 4
4212 Sort Descending
4214 >>> df.sort_values(by='col1', ascending=False)
4215 col1 col2 col3
4216 4 D 7 2
4217 5 C 4 3
4218 2 B 9 9
4219 0 A 2 0
4220 1 A 1 1
4221 3 NaN 8 4
4223 Putting NAs first
4225 >>> df.sort_values(by='col1', ascending=False, na_position='first')
4226 col1 col2 col3
4227 3 NaN 8 4
4228 4 D 7 2
4229 5 C 4 3
4230 2 B 9 9
4231 0 A 2 0
4232 1 A 1 1
4233 """
4234 raise AbstractMethodError(self)
4236 def sort_index(
4237 self,
4238 axis=0,
4239 level=None,
4240 ascending: bool_t = True,
4241 inplace: bool_t = False,
4242 kind: str = "quicksort",
4243 na_position: str = "last",
4244 sort_remaining: bool_t = True,
4245 ignore_index: bool_t = False,
4246 ):
4247 """
4248 Sort object by labels (along an axis).
4250 Parameters
4251 ----------
4252 axis : {0 or 'index', 1 or 'columns'}, default 0
4253 The axis along which to sort. The value 0 identifies the rows,
4254 and 1 identifies the columns.
4255 level : int or level name or list of ints or list of level names
4256 If not None, sort on values in specified index level(s).
4257 ascending : bool, default True
4258 Sort ascending vs. descending.
4259 inplace : bool, default False
4260 If True, perform operation in-place.
4261 kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
4262 Choice of sorting algorithm. See also ``numpy.sort`` for more
4263 information. `mergesort` is the only stable algorithm. For
4264 DataFrames, this option is only applied when sorting on a single
4265 column or label.
4266 na_position : {'first', 'last'}, default 'last'
4267 Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
4268 Not implemented for MultiIndex.
4269 sort_remaining : bool, default True
4270 If True and sorting by level and index is multilevel, sort by other
4271 levels too (in order) after sorting by specified level.
4272 ignore_index : bool, default False
4273 If True, the resulting axis will be labeled 0, 1, …, n - 1.
4275 .. versionadded:: 1.0.0
4277 Returns
4278 -------
4279 sorted_obj : DataFrame or None
4280 DataFrame with sorted index if inplace=False, None otherwise.
4281 """
4282 inplace = validate_bool_kwarg(inplace, "inplace")
4283 axis = self._get_axis_number(axis)
4284 axis_name = self._get_axis_name(axis)
4285 labels = self._get_axis(axis)
4287 if level is not None:
4288 raise NotImplementedError("level is not implemented")
4289 if inplace:
4290 raise NotImplementedError("inplace is not implemented")
4292 sort_index = labels.argsort()
4293 if not ascending:
4294 sort_index = sort_index[::-1]
4296 new_axis = labels.take(sort_index)
4297 return self.reindex(**{axis_name: new_axis})
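# Minimal sketch of the generic ``sort_index`` path above (no ``level``,
# no ``inplace``); values are illustrative.
#
# >>> df = pd.DataFrame({"a": [1, 2]}, index=["y", "x"])
# >>> df.sort_index()
#    a
# x  2
# y  1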
4299 def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries:
4300 """
4301 Conform %(klass)s to new index with optional filling logic.
4303 Places NA/NaN in locations having no value in the previous index. A new object
4304 is produced unless the new index is equivalent to the current one and
4305 ``copy=False``.
4307 Parameters
4308 ----------
4309 %(optional_labels)s
4310 %(axes)s : array-like, optional
4311 New labels / index to conform to, should be specified using
4312 keywords. Preferably an Index object to avoid duplicating data.
4313 %(optional_axis)s
4314 method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
4315 Method to use for filling holes in reindexed DataFrame.
4316 Please note: this is only applicable to DataFrames/Series with a
4317 monotonically increasing/decreasing index.
4319 * None (default): don't fill gaps
4320 * pad / ffill: Propagate last valid observation forward to next
4321 valid.
4322 * backfill / bfill: Use next valid observation to fill gap.
4323 * nearest: Use nearest valid observations to fill gap.
4325 copy : bool, default True
4326 Return a new object, even if the passed indexes are the same.
4327 level : int or name
4328 Broadcast across a level, matching Index values on the
4329 passed MultiIndex level.
4330 fill_value : scalar, default np.NaN
4331 Value to use for missing values. Defaults to NaN, but can be any
4332 "compatible" value.
4333 limit : int, default None
4334 Maximum number of consecutive elements to forward or backward fill.
4335 tolerance : optional
4336 Maximum distance between original and new labels for inexact
4337 matches. The values of the index at the matching locations must
4338 satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
4340 Tolerance may be a scalar value, which applies the same tolerance
4341 to all values, or list-like, which applies variable tolerance per
4342 element. List-like includes list, tuple, array, Series, and must be
4343 the same size as the index and its dtype must exactly match the
4344 index's type.
4346 .. versionadded:: 0.21.0 (list-like tolerance)
4348 Returns
4349 -------
4350 %(klass)s with changed index.
4352 See Also
4353 --------
4354 DataFrame.set_index : Set row labels.
4355 DataFrame.reset_index : Remove row labels or move them to new columns.
4356 DataFrame.reindex_like : Change to same indices as other DataFrame.
4358 Examples
4359 --------
4361 ``DataFrame.reindex`` supports two calling conventions
4363 * ``(index=index_labels, columns=column_labels, ...)``
4364 * ``(labels, axis={'index', 'columns'}, ...)``
4366 We *highly* recommend using keyword arguments to clarify your
4367 intent.
4369 Create a dataframe with some fictional data.
4371 >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
4372 >>> df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301],
4373 ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},
4374 ... index=index)
4375 >>> df
4376 http_status response_time
4377 Firefox 200 0.04
4378 Chrome 200 0.02
4379 Safari 404 0.07
4380 IE10 404 0.08
4381 Konqueror 301 1.00
4383 Create a new index and reindex the dataframe. By default
4384 values in the new index that do not have corresponding
4385 records in the dataframe are assigned ``NaN``.
4387 >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
4388 ... 'Chrome']
4389 >>> df.reindex(new_index)
4390 http_status response_time
4391 Safari 404.0 0.07
4392 Iceweasel NaN NaN
4393 Comodo Dragon NaN NaN
4394 IE10 404.0 0.08
4395 Chrome 200.0 0.02
4397 We can fill in the missing values by passing a value to
4398 the keyword ``fill_value``. Because the index is not monotonically
4399 increasing or decreasing, we cannot use arguments to the keyword
4400 ``method`` to fill the ``NaN`` values.
4402 >>> df.reindex(new_index, fill_value=0)
4403 http_status response_time
4404 Safari 404 0.07
4405 Iceweasel 0 0.00
4406 Comodo Dragon 0 0.00
4407 IE10 404 0.08
4408 Chrome 200 0.02
4410 >>> df.reindex(new_index, fill_value='missing')
4411 http_status response_time
4412 Safari 404 0.07
4413 Iceweasel missing missing
4414 Comodo Dragon missing missing
4415 IE10 404 0.08
4416 Chrome 200 0.02
4418 We can also reindex the columns.
4420 >>> df.reindex(columns=['http_status', 'user_agent'])
4421 http_status user_agent
4422 Firefox 200 NaN
4423 Chrome 200 NaN
4424 Safari 404 NaN
4425 IE10 404 NaN
4426 Konqueror 301 NaN
4428 Or we can use "axis-style" keyword arguments
4430 >>> df.reindex(['http_status', 'user_agent'], axis="columns")
4431 http_status user_agent
4432 Firefox 200 NaN
4433 Chrome 200 NaN
4434 Safari 404 NaN
4435 IE10 404 NaN
4436 Konqueror 301 NaN
4438 To further illustrate the filling functionality in
4439 ``reindex``, we will create a dataframe with a
4440 monotonically increasing index (for example, a sequence
4441 of dates).
4443 >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
4444 >>> df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]},
4445 ... index=date_index)
4446 >>> df2
4447 prices
4448 2010-01-01 100.0
4449 2010-01-02 101.0
4450 2010-01-03 NaN
4451 2010-01-04 100.0
4452 2010-01-05 89.0
4453 2010-01-06 88.0
4455 Suppose we decide to expand the dataframe to cover a wider
4456 date range.
4458 >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
4459 >>> df2.reindex(date_index2)
4460 prices
4461 2009-12-29 NaN
4462 2009-12-30 NaN
4463 2009-12-31 NaN
4464 2010-01-01 100.0
4465 2010-01-02 101.0
4466 2010-01-03 NaN
4467 2010-01-04 100.0
4468 2010-01-05 89.0
4469 2010-01-06 88.0
4470 2010-01-07 NaN
4472 The index entries that did not have a value in the original data frame
4473 (for example, '2009-12-29') are by default filled with ``NaN``.
4474 If desired, we can fill in the missing values using one of several
4475 options.
4477 For example, to back-propagate the last valid value to fill the ``NaN``
4478 values, pass ``bfill`` as an argument to the ``method`` keyword.
4480 >>> df2.reindex(date_index2, method='bfill')
4481 prices
4482 2009-12-29 100.0
4483 2009-12-30 100.0
4484 2009-12-31 100.0
4485 2010-01-01 100.0
4486 2010-01-02 101.0
4487 2010-01-03 NaN
4488 2010-01-04 100.0
4489 2010-01-05 89.0
4490 2010-01-06 88.0
4491 2010-01-07 NaN
4493 Please note that the ``NaN`` value present in the original dataframe
4494 (at index value 2010-01-03) will not be filled by any of the
4495 value propagation schemes. This is because filling while reindexing
4496 does not look at dataframe values, but only compares the original and
4497 desired indexes. If you do want to fill in the ``NaN`` values present
4498 in the original dataframe, use the ``fillna()`` method.
4500 See the :ref:`user guide <basics.reindexing>` for more.
4501 """
4502 # TODO: Decide if we care about having different examples for different
4503 # kinds
4505 # construct the args
4506 axes, kwargs = self._construct_axes_from_arguments(args, kwargs)
4507 method = missing.clean_reindex_fill_method(kwargs.pop("method", None))
4508 level = kwargs.pop("level", None)
4509 copy = kwargs.pop("copy", True)
4510 limit = kwargs.pop("limit", None)
4511 tolerance = kwargs.pop("tolerance", None)
4512 fill_value = kwargs.pop("fill_value", None)
4514 # Series.reindex doesn't use / need the axis kwarg
4515 # We pop and ignore it here, to make writing Series/Frame generic code
4516 # easier
4517 kwargs.pop("axis", None)
4519 if kwargs:
4520 raise TypeError(
4521 "reindex() got an unexpected keyword "
4522 f'argument "{list(kwargs.keys())[0]}"'
4523 )
4525 self._consolidate_inplace()
4527 # if all axes that are requested to reindex are equal, then only copy
4528 # if indicated must have index names equal here as well as values
4529 if all(
4530 self._get_axis(axis).identical(ax)
4531 for axis, ax in axes.items()
4532 if ax is not None
4533 ):
4534 if copy:
4535 return self.copy()
4536 return self
4538 # check if we are a multi reindex
4539 if self._needs_reindex_multi(axes, method, level):
4540 return self._reindex_multi(axes, copy, fill_value)
4542 # perform the reindex on the axes
4543 return self._reindex_axes(
4544 axes, level, limit, tolerance, method, fill_value, copy
4545 ).__finalize__(self)
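# The ``tolerance`` parameter documented above has no example of its own;
# a minimal sketch with illustrative values:
#
# >>> s = pd.Series([1.0, 2.0], index=[0, 10])
# >>> s.reindex([0, 1, 10], method="nearest", tolerance=2)
# 0     1.0
# 1     1.0
# 10    2.0
# dtype: float64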
4547 def _reindex_axes(
4548 self: FrameOrSeries, axes, level, limit, tolerance, method, fill_value, copy
4549 ) -> FrameOrSeries:
4550 """Perform the reindex for all the axes."""
4551 obj = self
4552 for a in self._AXIS_ORDERS:
4553 labels = axes[a]
4554 if labels is None:
4555 continue
4557 ax = self._get_axis(a)
4558 new_index, indexer = ax.reindex(
4559 labels, level=level, limit=limit, tolerance=tolerance, method=method
4560 )
4562 axis = self._get_axis_number(a)
4563 obj = obj._reindex_with_indexers(
4564 {axis: [new_index, indexer]},
4565 fill_value=fill_value,
4566 copy=copy,
4567 allow_dups=False,
4568 )
4570 return obj
4572 def _needs_reindex_multi(self, axes, method, level) -> bool_t:
4573 """Check if we do need a multi reindex."""
4574 return (
4575 (com.count_not_none(*axes.values()) == self._AXIS_LEN)
4576 and method is None
4577 and level is None
4578 and not self._is_mixed_type
4579 )
4581 def _reindex_multi(self, axes, copy, fill_value):
4582 raise AbstractMethodError(self)
4584 def _reindex_with_indexers(
4585 self: FrameOrSeries,
4586 reindexers,
4587 fill_value=None,
4588 copy: bool_t = False,
4589 allow_dups: bool_t = False,
4590 ) -> FrameOrSeries:
4591 """allow_dups indicates an internal call here """
4593 # reindex doing multiple operations on different axes if indicated
4594 new_data = self._data
4595 for axis in sorted(reindexers.keys()):
4596 index, indexer = reindexers[axis]
4597 baxis = self._get_block_manager_axis(axis)
4599 if index is None:
4600 continue
4602 index = ensure_index(index)
4603 if indexer is not None:
4604 indexer = ensure_int64(indexer)
4606 # TODO: speed up on homogeneous DataFrame objects
4607 new_data = new_data.reindex_indexer(
4608 index,
4609 indexer,
4610 axis=baxis,
4611 fill_value=fill_value,
4612 allow_dups=allow_dups,
4613 copy=copy,
4614 )
4616 if copy and new_data is self._data:
4617 new_data = new_data.copy()
4619 return self._constructor(new_data).__finalize__(self)
4621 def filter(
4622 self: FrameOrSeries,
4623 items=None,
4624 like: Optional[str] = None,
4625 regex: Optional[str] = None,
4626 axis=None,
4627 ) -> FrameOrSeries:
4628 """
4629 Subset the dataframe rows or columns according to the specified index labels.
4631 Note that this routine does not filter a dataframe on its
4632 contents. The filter is applied to the labels of the index.
4634 Parameters
4635 ----------
4636 items : list-like
4637 Keep labels from axis which are in items.
4638 like : str
4639 Keep labels from axis for which "like in label == True".
4640 regex : str (regular expression)
4641 Keep labels from axis for which re.search(regex, label) == True.
4642 axis : {0 or 'index', 1 or 'columns', None}, default None
4643 The axis to filter on, expressed either as an index (int)
4644 or axis name (str). By default this is the info axis,
4645 'index' for Series, 'columns' for DataFrame.
4647 Returns
4648 -------
4649 same type as input object
4651 See Also
4652 --------
4653 DataFrame.loc
4655 Notes
4656 -----
4657 The ``items``, ``like``, and ``regex`` parameters are
4658 enforced to be mutually exclusive.
4660 ``axis`` defaults to the info axis that is used when indexing
4661 with ``[]``.
4663 Examples
4664 --------
4665 >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
4666 ... index=['mouse', 'rabbit'],
4667 ... columns=['one', 'two', 'three'])
4669 >>> # select columns by name
4670 >>> df.filter(items=['one', 'three'])
4671 one three
4672 mouse 1 3
4673 rabbit 4 6
4675 >>> # select columns by regular expression
4676 >>> df.filter(regex='e$', axis=1)
4677 one three
4678 mouse 1 3
4679 rabbit 4 6
4681 >>> # select rows containing 'bbi'
4682 >>> df.filter(like='bbi', axis=0)
4683 one two three
4684 rabbit 4 5 6
4685 """
4686 nkw = com.count_not_none(items, like, regex)
4687 if nkw > 1:
4688 raise TypeError(
4689 "Keyword arguments `items`, `like`, or `regex` "
4690 "are mutually exclusive"
4691 )
4693 if axis is None:
4694 axis = self._info_axis_name
4695 labels = self._get_axis(axis)
4697 if items is not None:
4698 name = self._get_axis_name(axis)
4699 return self.reindex(**{name: [r for r in items if r in labels]})
4700 elif like:
4702 def f(x):
4703 return like in ensure_str(x)
4705 values = labels.map(f)
4706 return self.loc(axis=axis)[values]
4707 elif regex:
4709 def f(x):
4710 return matcher.search(ensure_str(x)) is not None
4712 matcher = re.compile(regex)
4713 values = labels.map(f)
4714 return self.loc(axis=axis)[values]
4715 else:
4716 raise TypeError("Must pass either `items`, `like`, or `regex`")
4718 def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries:
4719 """
4720 Return the first `n` rows.
4722 This function returns the first `n` rows for the object based
4723 on position. It is useful for quickly testing if your object
4724 has the right type of data in it.
4726 For negative values of `n`, this function returns all rows except
4727 the last `n` rows, equivalent to ``df[:-n]``.
4729 Parameters
4730 ----------
4731 n : int, default 5
4732 Number of rows to select.
4734 Returns
4735 -------
4736 same type as caller
4737 The first `n` rows of the caller object.
4739 See Also
4740 --------
4741 DataFrame.tail: Returns the last `n` rows.
4743 Examples
4744 --------
4745 >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
4746 ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
4747 >>> df
4748 animal
4749 0 alligator
4750 1 bee
4751 2 falcon
4752 3 lion
4753 4 monkey
4754 5 parrot
4755 6 shark
4756 7 whale
4757 8 zebra
4759 Viewing the first 5 lines
4761 >>> df.head()
4762 animal
4763 0 alligator
4764 1 bee
4765 2 falcon
4766 3 lion
4767 4 monkey
4769 Viewing the first `n` lines (three in this case)
4771 >>> df.head(3)
4772 animal
4773 0 alligator
4774 1 bee
4775 2 falcon
4777 For negative values of `n`
4779 >>> df.head(-3)
4780 animal
4781 0 alligator
4782 1 bee
4783 2 falcon
4784 3 lion
4785 4 monkey
4786 5 parrot
4787 """
4789 return self.iloc[:n]
4791 def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries:
4792 """
4793 Return the last `n` rows.
4795 This function returns last `n` rows from the object based on
4796 position. It is useful for quickly verifying data, for example,
4797 after sorting or appending rows.
4799 For negative values of `n`, this function returns all rows except
4800 the first `n` rows, equivalent to ``df[n:]``.
4802 Parameters
4803 ----------
4804 n : int, default 5
4805 Number of rows to select.
4807 Returns
4808 -------
4809 type of caller
4810 The last `n` rows of the caller object.
4812 See Also
4813 --------
4814 DataFrame.head : The first `n` rows of the caller object.
4816 Examples
4817 --------
4818 >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
4819 ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
4820 >>> df
4821 animal
4822 0 alligator
4823 1 bee
4824 2 falcon
4825 3 lion
4826 4 monkey
4827 5 parrot
4828 6 shark
4829 7 whale
4830 8 zebra
4832 Viewing the last 5 lines
4834 >>> df.tail()
4835 animal
4836 4 monkey
4837 5 parrot
4838 6 shark
4839 7 whale
4840 8 zebra
4842 Viewing the last `n` lines (three in this case)
4844 >>> df.tail(3)
4845 animal
4846 6 shark
4847 7 whale
4848 8 zebra
4850 For negative values of `n`
4852 >>> df.tail(-3)
4853 animal
4854 3 lion
4855 4 monkey
4856 5 parrot
4857 6 shark
4858 7 whale
4859 8 zebra
4860 """
4862 if n == 0:
4863 return self.iloc[0:0]
4864 return self.iloc[-n:]
4866 def sample(
4867 self: FrameOrSeries,
4868 n=None,
4869 frac=None,
4870 replace=False,
4871 weights=None,
4872 random_state=None,
4873 axis=None,
4874 ) -> FrameOrSeries:
4875 """
4876 Return a random sample of items from an axis of object.
4878 You can use `random_state` for reproducibility.
4880 Parameters
4881 ----------
4882 n : int, optional
4883 Number of items from axis to return. Cannot be used with `frac`.
4884 Default = 1 if `frac` = None.
4885 frac : float, optional
4886 Fraction of axis items to return. Cannot be used with `n`.
4887 replace : bool, default False
4888 Allow or disallow sampling of the same row more than once.
4889 weights : str or ndarray-like, optional
4890 Default 'None' results in equal probability weighting.
4891 If passed a Series, will align with target object on index. Index
4892 values in weights not found in sampled object will be ignored and
4893 index values in sampled object not in weights will be assigned
4894 weights of zero.
4895 If called on a DataFrame, will accept the name of a column
4896 when axis = 0.
4897 Unless weights are a Series, weights must be the same length as the
4898 axis being sampled.
4899 If weights do not sum to 1, they will be normalized to sum to 1.
4900 Missing values in the weights column will be treated as zero.
4901 Infinite values not allowed.
4902 random_state : int or numpy.random.RandomState, optional
4903 Seed for the random number generator (if int), or numpy RandomState
4904 object.
4905 axis : {0 or 'index', 1 or 'columns', None}, default None
4906 Axis to sample. Accepts axis number or name. Default is stat axis
4907 for given data type (0 for Series and DataFrames).
4909 Returns
4910 -------
4911 Series or DataFrame
4912 A new object of same type as caller containing `n` items randomly
4913 sampled from the caller object.
4915 See Also
4916 --------
4917 numpy.random.choice: Generates a random sample from a given 1-D numpy
4918 array.
4920 Notes
4921 -----
4922 If `frac` > 1, `replace` should be set to `True`.
4924 Examples
4925 --------
4926 >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
4927 ... 'num_wings': [2, 0, 0, 0],
4928 ... 'num_specimen_seen': [10, 2, 1, 8]},
4929 ... index=['falcon', 'dog', 'spider', 'fish'])
4930 >>> df
4931 num_legs num_wings num_specimen_seen
4932 falcon 2 2 10
4933 dog 4 0 2
4934 spider 8 0 1
4935 fish 0 0 8
4937 Extract 3 random elements from the ``Series`` ``df['num_legs']``:
4938 Note that we use `random_state` to ensure the reproducibility of
4939 the examples.
4941 >>> df['num_legs'].sample(n=3, random_state=1)
4942 fish 0
4943 spider 8
4944 falcon 2
4945 Name: num_legs, dtype: int64
4947 A random 50% sample of the ``DataFrame`` with replacement:
4949 >>> df.sample(frac=0.5, replace=True, random_state=1)
4950 num_legs num_wings num_specimen_seen
4951 dog 4 0 2
4952 fish 0 0 8
4954 An upsampled sample of the ``DataFrame`` with replacement:
4955 Note that the `replace` parameter has to be `True` when `frac` > 1.
4957 >>> df.sample(frac=2, replace=True, random_state=1)
4958 num_legs num_wings num_specimen_seen
4959 dog 4 0 2
4960 fish 0 0 8
4961 falcon 2 2 10
4962 falcon 2 2 10
4963 fish 0 0 8
4964 dog 4 0 2
4965 fish 0 0 8
4966 dog 4 0 2
4968 Using a DataFrame column as weights. Rows with larger value in the
4969 `num_specimen_seen` column are more likely to be sampled.
4971 >>> df.sample(n=2, weights='num_specimen_seen', random_state=1)
4972 num_legs num_wings num_specimen_seen
4973 falcon 2 2 10
4974 fish 0 0 8
4975 """
4977 if axis is None:
4978 axis = self._stat_axis_number
4980 axis = self._get_axis_number(axis)
4981 axis_length = self.shape[axis]
4983 # Process random_state argument
4984 rs = com.random_state(random_state)
4986 # Check weights for compliance
4987 if weights is not None:
4989 # If a series, align with frame
4990 if isinstance(weights, ABCSeries):
4991 weights = weights.reindex(self.axes[axis])
4993 # Strings acceptable if a dataframe and axis = 0
4994 if isinstance(weights, str):
4995 if isinstance(self, ABCDataFrame):
4996 if axis == 0:
4997 try:
4998 weights = self[weights]
4999 except KeyError:
5000 raise KeyError(
5001 "String passed to weights not a valid column"
5002 )
5003 else:
5004 raise ValueError(
5005 "Strings can only be passed to "
5006 "weights when sampling from rows on "
5007 "a DataFrame"
5008 )
5009 else:
5010 raise ValueError(
5011 "Strings cannot be passed as weights "
5012 "when sampling from a Series."
5013 )
5015 weights = pd.Series(weights, dtype="float64")
5017 if len(weights) != axis_length:
5018 raise ValueError(
5019 "Weights and axis to be sampled must be of same length"
5020 )
5022 if (weights == np.inf).any() or (weights == -np.inf).any():
5023 raise ValueError("weight vector may not include `inf` values")
5025 if (weights < 0).any():
5026 raise ValueError("weight vector many not include negative values")
5028 # If has nan, set to zero.
5029 weights = weights.fillna(0)
5031 # Renormalize if don't sum to 1
5032 if weights.sum() != 1:
5033 if weights.sum() != 0:
5034 weights = weights / weights.sum()
5035 else:
5036 raise ValueError("Invalid weights: weights sum to zero")
5038 weights = weights.values
5040 # If no frac or n, default to n=1.
5041 if n is None and frac is None:
5042 n = 1
5043 elif frac is not None and frac > 1 and not replace:
5044 raise ValueError(
5045 "Replace has to be set to `True` when "
5046 "upsampling the population `frac` > 1."
5047 )
5048 elif n is not None and frac is None and n % 1 != 0:
5049 raise ValueError("Only integers accepted as `n` values")
5050 elif n is None and frac is not None:
5051 n = int(round(frac * axis_length))
5052 elif n is not None and frac is not None:
5053 raise ValueError("Please enter a value for `frac` OR `n`, not both")
5055 # Check for negative sizes
5056 if n < 0:
5057 raise ValueError(
5058 "A negative number of rows requested. Please provide positive value."
5059 )
5061 locs = rs.choice(axis_length, size=n, replace=replace, p=weights)
5062 return self.take(locs, axis=axis)
5064 _shared_docs[
5065 "pipe"
5066 ] = r"""
5067 Apply func(self, \*args, \*\*kwargs).
5069 Parameters
5070 ----------
5071 func : function
5072 Function to apply to the %(klass)s.
5073 ``args``, and ``kwargs`` are passed into ``func``.
5074 Alternatively a ``(callable, data_keyword)`` tuple where
5075 ``data_keyword`` is a string indicating the keyword of
5076 ``callable`` that expects the %(klass)s.
5077 args : iterable, optional
5078 Positional arguments passed into ``func``.
5079 kwargs : mapping, optional
5080 A dictionary of keyword arguments passed into ``func``.
5082 Returns
5083 -------
5084 object : the return type of ``func``.
5086 See Also
5087 --------
5088 DataFrame.apply
5089 DataFrame.applymap
5090 Series.map
5092 Notes
5093 -----
5095 Use ``.pipe`` when chaining together functions that expect
5096 Series, DataFrames or GroupBy objects. Instead of writing
5098 >>> f(g(h(df), arg1=a), arg2=b, arg3=c)
5100 You can write
5102 >>> (df.pipe(h)
5103 ... .pipe(g, arg1=a)
5104 ... .pipe(f, arg2=b, arg3=c)
5105 ... )
5107 If you have a function that takes the data as (say) the second
5108 argument, pass a tuple indicating which keyword expects the
5109 data. For example, suppose ``f`` takes its data as ``arg2``:
5111 >>> (df.pipe(h)
5112 ... .pipe(g, arg1=a)
5113 ... .pipe((f, 'arg2'), arg1=a, arg3=c)
5114 ... )
5115 """
5117 @Appender(_shared_docs["pipe"] % _shared_doc_kwargs)
5118 def pipe(self, func, *args, **kwargs):
5119 return com.pipe(self, func, *args, **kwargs)
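# Concrete sketch of the ``(callable, data_keyword)`` form described in the
# shared doc above; ``subtract`` is a hypothetical helper.
#
# >>> def subtract(amount, data):
# ...     return data - amount
# >>> pd.Series([5, 6]).pipe((subtract, "data"), 1)
# 0    4
# 1    5
# dtype: int64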
5121 _shared_docs["aggregate"] = dedent(
5122 """
5123 Aggregate using one or more operations over the specified axis.
5124 %(versionadded)s
5125 Parameters
5126 ----------
5127 func : function, str, list or dict
5128 Function to use for aggregating the data. If a function, must either
5129 work when passed a %(klass)s or when passed to %(klass)s.apply.
5131 Accepted combinations are:
5133 - function
5134 - string function name
5135 - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
5136 - dict of axis labels -> functions, function names or list of such.
5137 %(axis)s
5138 *args
5139 Positional arguments to pass to `func`.
5140 **kwargs
5141 Keyword arguments to pass to `func`.
5143 Returns
5144 -------
5145 scalar, Series or DataFrame
5147 The return can be:
5149 * scalar : when Series.agg is called with single function
5150 * Series : when DataFrame.agg is called with a single function
5151 * DataFrame : when DataFrame.agg is called with several functions
5153 Return scalar, Series or DataFrame.
5154 %(see_also)s
5155 Notes
5156 -----
5157 `agg` is an alias for `aggregate`. Use the alias.
5159 A passed user-defined-function will be passed a Series for evaluation.
5160 %(examples)s"""
5161 )
5163 _shared_docs[
5164 "transform"
5165 ] = """
5166 Call ``func`` on self producing a %(klass)s with transformed values.
5168 Produced %(klass)s will have same axis length as self.
5170 Parameters
5171 ----------
5172 func : function, str, list or dict
5173 Function to use for transforming the data. If a function, must either
5174 work when passed a %(klass)s or when passed to %(klass)s.apply.
5176 Accepted combinations are:
5178 - function
5179 - string function name
5180 - list of functions and/or function names, e.g. ``[np.exp, 'sqrt']``
5181 - dict of axis labels -> functions, function names or list of such.
5182 %(axis)s
5183 *args
5184 Positional arguments to pass to `func`.
5185 **kwargs
5186 Keyword arguments to pass to `func`.
5188 Returns
5189 -------
5190 %(klass)s
5191 A %(klass)s that must have the same length as self.
5193 Raises
5194 ------
5195 ValueError : If the returned %(klass)s has a different length than self.
5197 See Also
5198 --------
5199 %(klass)s.agg : Only perform aggregating type operations.
5200 %(klass)s.apply : Invoke function on a %(klass)s.
5202 Examples
5203 --------
5204 >>> df = pd.DataFrame({'A': range(3), 'B': range(1, 4)})
5205 >>> df
5206 A B
5207 0 0 1
5208 1 1 2
5209 2 2 3
5210 >>> df.transform(lambda x: x + 1)
5211 A B
5212 0 1 2
5213 1 2 3
5214 2 3 4
5216 Even though the resulting %(klass)s must have the same length as the
5217 input %(klass)s, it is possible to provide several input functions:
5219 >>> s = pd.Series(range(3))
5220 >>> s
5221 0 0
5222 1 1
5223 2 2
5224 dtype: int64
5225 >>> s.transform([np.sqrt, np.exp])
5226 sqrt exp
5227 0 0.000000 1.000000
5228 1 1.000000 2.718282
5229 2 1.414214 7.389056
5230 """
5232 # ----------------------------------------------------------------------
5233 # Attribute access
5235 def __finalize__(
5236 self: FrameOrSeries, other, method=None, **kwargs
5237 ) -> FrameOrSeries:
5238 """
5239 Propagate metadata from other to self.
5241 Parameters
5242 ----------
5243 other : the object from which to get the attributes that we are going
5244 to propagate
5245 method : optional, a passed method name; possibly used to take
5246 different types of propagation actions based on this
5248 """
5249 if isinstance(other, NDFrame):
5250 for name in other.attrs:
5251 self.attrs[name] = other.attrs[name]
5252 # For subclasses using _metadata.
5253 for name in self._metadata:
5254 object.__setattr__(self, name, getattr(other, name, None))
5255 return self
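# Sketch of the metadata propagation above (illustrative): ``copy`` calls
# ``__finalize__`` internally, so ``attrs`` carries over.
#
# >>> df = pd.DataFrame({"a": [1]})
# >>> df.attrs["source"] = "survey"
# >>> df.copy().attrs
# {'source': 'survey'}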
5257 def __getattr__(self, name: str):
5258 """After regular attribute access, try looking up the name
5259 This allows simpler access to columns for interactive use.
5260 """
5262 # Note: obj.x will always call obj.__getattribute__('x') prior to
5263 # calling obj.__getattr__('x').
5265 if (
5266 name in self._internal_names_set
5267 or name in self._metadata
5268 or name in self._accessors
5269 ):
5270 return object.__getattribute__(self, name)
5271 else:
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
5273 return self[name]
5274 return object.__getattribute__(self, name)
5276 def __setattr__(self, name: str, value) -> None:
5277 """After regular attribute access, try setting the name
5278 This allows simpler access to columns for interactive use.
5279 """
5281 # first try regular attribute access via __getattribute__, so that
5282 # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify
5283 # the same attribute.
5285 try:
5286 object.__getattribute__(self, name)
5287 return object.__setattr__(self, name, value)
5288 except AttributeError:
5289 pass
5291 # if this fails, go on to more involved attribute setting
5292 # (note that this matches __getattr__, above).
5293 if name in self._internal_names_set:
5294 object.__setattr__(self, name, value)
5295 elif name in self._metadata:
5296 object.__setattr__(self, name, value)
5297 else:
5298 try:
5299 existing = getattr(self, name)
5300 if isinstance(existing, Index):
5301 object.__setattr__(self, name, value)
5302 elif name in self._info_axis:
5303 self[name] = value
5304 else:
5305 object.__setattr__(self, name, value)
5306 except (AttributeError, TypeError):
5307 if isinstance(self, ABCDataFrame) and (is_list_like(value)):
5308 warnings.warn(
5309 "Pandas doesn't allow columns to be "
5310 "created via a new attribute name - see "
5311 "https://pandas.pydata.org/pandas-docs/"
5312 "stable/indexing.html#attribute-access",
5313 stacklevel=2,
5314 )
5315 object.__setattr__(self, name, value)
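# Sketch of the fallback branch above (illustrative): assigning a
# list-like to a new attribute on a DataFrame sets an attribute, not a
# column, and emits the warning.
#
# >>> df = pd.DataFrame({"a": [1, 2]})
# >>> df.b = [3, 4]  # UserWarning: Pandas doesn't allow columns ...
# >>> "b" in df.columns
# False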
5317 def _dir_additions(self):
5318 """ add the string-like attributes from the info_axis.
5319 If info_axis is a MultiIndex, it's first level values are used.
5320 """
5321 additions = {
5322 c
5323 for c in self._info_axis.unique(level=0)[:100]
5324 if isinstance(c, str) and c.isidentifier()
5325 }
5326 return super()._dir_additions().union(additions)
5328 # ----------------------------------------------------------------------
5329 # Consolidation of internals
5331 def _protect_consolidate(self, f):
5332 """Consolidate _data -- if the blocks have changed, then clear the
5333 cache
5334 """
5335 blocks_before = len(self._data.blocks)
5336 result = f()
5337 if len(self._data.blocks) != blocks_before:
5338 self._clear_item_cache()
5339 return result
5341 def _consolidate_inplace(self) -> None:
5342 """Consolidate data in place and return None"""
5344 def f():
5345 self._data = self._data.consolidate()
5347 self._protect_consolidate(f)
5349 def _consolidate(self, inplace: bool_t = False):
5350 """
5351 Compute NDFrame with "consolidated" internals (data of each dtype
5352 grouped together in a single ndarray).
5354 Parameters
5355 ----------
5356 inplace : bool, default False
5357 If False return new object, otherwise modify existing object.
5359 Returns
5360 -------
5361 consolidated : same type as caller
5362 """
5363 inplace = validate_bool_kwarg(inplace, "inplace")
5364 if inplace:
5365 self._consolidate_inplace()
5366 else:
5367 f = lambda: self._data.consolidate()
5368 cons_data = self._protect_consolidate(f)
5369 return self._constructor(cons_data).__finalize__(self)
5371 @property
5372 def _is_mixed_type(self):
5373 f = lambda: self._data.is_mixed_type
5374 return self._protect_consolidate(f)
5376 @property
5377 def _is_numeric_mixed_type(self):
5378 f = lambda: self._data.is_numeric_mixed_type
5379 return self._protect_consolidate(f)
5381 @property
5382 def _is_datelike_mixed_type(self):
5383 f = lambda: self._data.is_datelike_mixed_type
5384 return self._protect_consolidate(f)
5386 def _check_inplace_setting(self, value) -> bool_t:
5387 """ check whether we allow in-place setting with this type of value """
5389 if self._is_mixed_type:
5390 if not self._is_numeric_mixed_type:
5392 # allow an actual np.nan through
5393 if is_float(value) and np.isnan(value):
5394 return True
5396 raise TypeError(
5397 "Cannot do inplace boolean setting on "
5398 "mixed-types with a non np.nan value"
5399 )
5401 return True
5403 def _get_numeric_data(self):
5404 return self._constructor(self._data.get_numeric_data()).__finalize__(self)
5406 def _get_bool_data(self):
5407 return self._constructor(self._data.get_bool_data()).__finalize__(self)
5409 # ----------------------------------------------------------------------
5410 # Internal Interface Methods
5412 @property
5413 def values(self) -> np.ndarray:
5414 """
5415 Return a Numpy representation of the DataFrame.
5417 .. warning::
5419 We recommend using :meth:`DataFrame.to_numpy` instead.
5421 Only the values in the DataFrame will be returned, the axes labels
5422 will be removed.
5424 Returns
5425 -------
5426 numpy.ndarray
5427 The values of the DataFrame.
5429 See Also
5430 --------
5431 DataFrame.to_numpy : Recommended alternative to this method.
5432 DataFrame.index : Retrieve the index labels.
5433 DataFrame.columns : Retrieving the column names.
5435 Notes
5436 -----
5437 The dtype will be a lower-common-denominator dtype (implicit
5438 upcasting); that is to say if the dtypes (even of numeric types)
5439 are mixed, the one that accommodates all will be chosen. Use this
5440 with care if you are not dealing with the blocks.
5442 e.g. If the dtypes are float16 and float32, dtype will be upcast to
5443 float32. If dtypes are int32 and uint8, dtype will be upcast to
5444 int32. By :func:`numpy.find_common_type` convention, mixing int64
5445 and uint64 will result in a float64 dtype.
5447 Examples
5448 --------
5449 A DataFrame where all columns are the same type (e.g., int64) results
5450 in an array of the same type.
5452 >>> df = pd.DataFrame({'age': [ 3, 29],
5453 ... 'height': [94, 170],
5454 ... 'weight': [31, 115]})
5455 >>> df
5456 age height weight
5457 0 3 94 31
5458 1 29 170 115
5459 >>> df.dtypes
5460 age int64
5461 height int64
5462 weight int64
5463 dtype: object
5464 >>> df.values
5465 array([[ 3, 94, 31],
5466 [ 29, 170, 115]], dtype=int64)
5468 A DataFrame with mixed-type columns (e.g., str/object, int64, float32)
5469 results in an ndarray of the broadest type that accommodates these
5470 mixed types (e.g., object).
5472 >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'),
5473 ... ('lion', 80.5, 1),
5474 ... ('monkey', np.nan, None)],
5475 ... columns=('name', 'max_speed', 'rank'))
5476 >>> df2.dtypes
5477 name object
5478 max_speed float64
5479 rank object
5480 dtype: object
5481 >>> df2.values
5482 array([['parrot', 24.0, 'second'],
5483 ['lion', 80.5, 1],
5484 ['monkey', nan, None]], dtype=object)
5485 """
5486 self._consolidate_inplace()
5487 return self._data.as_array(transpose=self._AXIS_REVERSED)
5489 @property
5490 def _values(self) -> np.ndarray:
5491 """internal implementation"""
5492 return self.values
5494 @property
5495 def _get_values(self) -> np.ndarray:
5496 # compat
5497 return self.values
5499 def _internal_get_values(self) -> np.ndarray:
5500 """
5501 Return an ndarray after converting sparse values to dense.
5503 This is the same as ``.values`` for non-sparse data. For sparse
5504 data contained in a `SparseArray`, the data are first
5505 converted to a dense representation.
5507 Returns
5508 -------
5509 numpy.ndarray
5510 Numpy representation of DataFrame.
5512 See Also
5513 --------
5514 values : Numpy representation of DataFrame.
5515 SparseArray : Container for sparse data.
5516 """
5517 return self.values
5519 @property
5520 def dtypes(self):
5521 """
5522 Return the dtypes in the DataFrame.
5524 This returns a Series with the data type of each column.
5525 The result's index is the original DataFrame's columns. Columns
5526 with mixed types are stored with the ``object`` dtype. See
5527 :ref:`the User Guide <basics.dtypes>` for more.
5529 Returns
5530 -------
5531 pandas.Series
5532 The data type of each column.
5534 Examples
5535 --------
5536 >>> df = pd.DataFrame({'float': [1.0],
5537 ... 'int': [1],
5538 ... 'datetime': [pd.Timestamp('20180310')],
5539 ... 'string': ['foo']})
5540 >>> df.dtypes
5541 float float64
5542 int int64
5543 datetime datetime64[ns]
5544 string object
5545 dtype: object
5546 """
5547 from pandas import Series
5549 return Series(self._data.get_dtypes(), index=self._info_axis, dtype=np.object_)
5551 def _to_dict_of_blocks(self, copy: bool_t = True):
5552 """
5553 Return a dict of dtype -> Constructor Types that
5554 each is a homogeneous dtype.
5556 Internal ONLY
5557 """
5558 return {
5559 k: self._constructor(v).__finalize__(self)
5560 for k, v in self._data.to_dict(copy=copy).items()
5561 }
5563 def astype(
5564 self: FrameOrSeries, dtype, copy: bool_t = True, errors: str = "raise"
5565 ) -> FrameOrSeries:
5566 """
5567 Cast a pandas object to a specified dtype ``dtype``.
5569 Parameters
5570 ----------
5571 dtype : data type, or dict of column name -> data type
5572 Use a numpy.dtype or Python type to cast entire pandas object to
5573 the same type. Alternatively, use {col: dtype, ...}, where col is a
5574 column label and dtype is a numpy.dtype or Python type to cast one
5575 or more of the DataFrame's columns to column-specific types.
5576 copy : bool, default True
5577 Return a copy when ``copy=True`` (be very careful setting
5578 ``copy=False`` as changes to values then may propagate to other
5579 pandas objects).
5580 errors : {'raise', 'ignore'}, default 'raise'
5581 Control raising of exceptions on invalid data for provided dtype.
5583 - ``raise`` : allow exceptions to be raised
5584 - ``ignore`` : suppress exceptions. On error return original object.
5586 Returns
5587 -------
5588 casted : same type as caller
5590 See Also
5591 --------
5592 to_datetime : Convert argument to datetime.
5593 to_timedelta : Convert argument to timedelta.
5594 to_numeric : Convert argument to a numeric type.
5595 numpy.ndarray.astype : Cast a numpy array to a specified type.
5597 Examples
5598 --------
5599 Create a DataFrame:
5601 >>> d = {'col1': [1, 2], 'col2': [3, 4]}
5602 >>> df = pd.DataFrame(data=d)
5603 >>> df.dtypes
5604 col1 int64
5605 col2 int64
5606 dtype: object
5608 Cast all columns to int32:
5610 >>> df.astype('int32').dtypes
5611 col1 int32
5612 col2 int32
5613 dtype: object
5615 Cast col1 to int32 using a dictionary:
5617 >>> df.astype({'col1': 'int32'}).dtypes
5618 col1 int32
5619 col2 int64
5620 dtype: object
5622 Create a series:
5624 >>> ser = pd.Series([1, 2], dtype='int32')
5625 >>> ser
5626 0 1
5627 1 2
5628 dtype: int32
5629 >>> ser.astype('int64')
5630 0 1
5631 1 2
5632 dtype: int64
5634 Convert to categorical type:
5636 >>> ser.astype('category')
5637 0 1
5638 1 2
5639 dtype: category
5640 Categories (2, int64): [1, 2]
5642 Convert to ordered categorical type with custom ordering:
5644 >>> cat_dtype = pd.api.types.CategoricalDtype(
5645 ... categories=[2, 1], ordered=True)
5646 >>> ser.astype(cat_dtype)
5647 0 1
5648 1 2
5649 dtype: category
5650 Categories (2, int64): [2 < 1]
5652 Note that using ``copy=False`` and changing data on a new
5653 pandas object may propagate changes:
5655 >>> s1 = pd.Series([1, 2])
5656 >>> s2 = s1.astype('int64', copy=False)
5657 >>> s2[0] = 10
5658 >>> s1 # note that s1[0] has changed too
5659 0 10
5660 1 2
5661 dtype: int64
5662 """
5663 if is_dict_like(dtype):
5664 if self.ndim == 1: # i.e. Series
5665 if len(dtype) > 1 or self.name not in dtype:
5666 raise KeyError(
5667 "Only the Series name can be used for "
5668 "the key in Series dtype mappings."
5669 )
5670 new_type = dtype[self.name]
5671 return self.astype(new_type, copy, errors)
5673 for col_name in dtype.keys():
5674 if col_name not in self:
5675 raise KeyError(
5676 "Only a column name can be used for the "
5677 "key in a dtype mappings argument."
5678 )
5679 results = []
5680 for col_name, col in self.items():
5681 if col_name in dtype:
5682 results.append(
5683 col.astype(dtype=dtype[col_name], copy=copy, errors=errors)
5684 )
5685 else:
5686 results.append(col.copy() if copy else col)
5688 elif is_extension_array_dtype(dtype) and self.ndim > 1:
5689 # GH 18099/22869: columnwise conversion to extension dtype
5690 # GH 24704: use iloc to handle duplicate column names
5691 results = [
5692 self.iloc[:, i].astype(dtype, copy=copy)
5693 for i in range(len(self.columns))
5694 ]
5696 else:
5697 # else, only a single dtype is given
5698 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors)
5699 return self._constructor(new_data).__finalize__(self)
5701 # GH 19920: retain column metadata after concat
5702 result = pd.concat(results, axis=1, copy=False)
5703 result.columns = self.columns
5704 return result
5706 def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
5707 """
5708 Make a copy of this object's indices and data.
5710 When ``deep=True`` (default), a new object will be created with a
5711 copy of the calling object's data and indices. Modifications to
5712 the data or indices of the copy will not be reflected in the
5713 original object (see notes below).
5715 When ``deep=False``, a new object will be created without copying
5716 the calling object's data or index (only references to the data
5717 and index are copied). Any changes to the data of the original
5718 will be reflected in the shallow copy (and vice versa).
5720 Parameters
5721 ----------
5722 deep : bool, default True
5723 Make a deep copy, including a copy of the data and the indices.
5724 With ``deep=False`` neither the indices nor the data are copied.
5726 Returns
5727 -------
5728 copy : Series or DataFrame
5729 Object type matches caller.
5731 Notes
5732 -----
5733 When ``deep=True``, data is copied but actual Python objects
5734 will not be copied recursively, only the reference to the object.
5735 This is in contrast to `copy.deepcopy` in the Standard Library,
5736 which recursively copies object data (see examples below).
5738 While ``Index`` objects are copied when ``deep=True``, the underlying
5739 numpy array is not copied for performance reasons. Since ``Index`` is
5740 immutable, the underlying data can be safely shared and a copy
5741 is not needed.
5743 Examples
5744 --------
5745 >>> s = pd.Series([1, 2], index=["a", "b"])
5746 >>> s
5747 a 1
5748 b 2
5749 dtype: int64
5751 >>> s_copy = s.copy()
5752 >>> s_copy
5753 a 1
5754 b 2
5755 dtype: int64
5757 **Shallow copy versus default (deep) copy:**
5759 >>> s = pd.Series([1, 2], index=["a", "b"])
5760 >>> deep = s.copy()
5761 >>> shallow = s.copy(deep=False)
5763 Shallow copy shares data and index with original.
5765 >>> s is shallow
5766 False
5767 >>> s.values is shallow.values and s.index is shallow.index
5768 True
5770 Deep copy has own copy of data and index.
5772 >>> s is deep
5773 False
5774 >>> s.values is deep.values or s.index is deep.index
5775 False
5777 Updates to the data shared by shallow copy and original are reflected
5778 in both; deep copy remains unchanged.
5780 >>> s[0] = 3
5781 >>> shallow[1] = 4
5782 >>> s
5783 a 3
5784 b 4
5785 dtype: int64
5786 >>> shallow
5787 a 3
5788 b 4
5789 dtype: int64
5790 >>> deep
5791 a 1
5792 b 2
5793 dtype: int64
5795 Note that when copying an object containing Python objects, a deep copy
5796 will copy the data, but will not do so recursively. Updating a nested
5797 data object will be reflected in the deep copy.
5799 >>> s = pd.Series([[1, 2], [3, 4]])
5800 >>> deep = s.copy()
5801 >>> s[0][0] = 10
5802 >>> s
5803 0 [10, 2]
5804 1 [3, 4]
5805 dtype: object
5806 >>> deep
5807 0 [10, 2]
5808 1 [3, 4]
5809 dtype: object
5810 """
5811 data = self._data.copy(deep=deep)
5812 return self._constructor(data).__finalize__(self)
5814 def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
5815 return self.copy(deep=deep)
5817 def __deepcopy__(self: FrameOrSeries, memo=None) -> FrameOrSeries:
5818 """
5819 Parameters
5820 ----------
5821 memo, default None
5822 Standard signature. Unused
5823 """
5824 return self.copy(deep=True)
5826 def _convert(
5827 self: FrameOrSeries,
5828 datetime: bool_t = False,
5829 numeric: bool_t = False,
5830 timedelta: bool_t = False,
5831 coerce: bool_t = False,
5832 copy: bool_t = True,
5833 ) -> FrameOrSeries:
5834 """
5835 Attempt to infer better dtype for object columns
5837 Parameters
5838 ----------
5839 datetime : bool, default False
5840 If True, convert to date where possible.
5841 numeric : bool, default False
5842 If True, attempt to convert to numbers (including strings), with
5843 unconvertible values becoming NaN.
5844 timedelta : bool, default False
5845 If True, convert to timedelta where possible.
5846 coerce : bool, default False
5847 If True, force conversion with unconvertible values converted to
5848 nulls (NaN or NaT).
5849 copy : bool, default True
5850 If True, return a copy even if no copy is necessary (e.g. no
5851 conversion was done). Note: This is meant for internal use, and
5852 should not be confused with inplace.
5854 Returns
5855 -------
5856 converted : same as input object
5857 """
5858 validate_bool_kwarg(datetime, "datetime")
5859 validate_bool_kwarg(numeric, "numeric")
5860 validate_bool_kwarg(timedelta, "timedelta")
5861 validate_bool_kwarg(coerce, "coerce")
5862 validate_bool_kwarg(copy, "copy")
5863 return self._constructor(
5864 self._data.convert(
5865 datetime=datetime,
5866 numeric=numeric,
5867 timedelta=timedelta,
5868 coerce=coerce,
5869 copy=copy,
5870 )
5871 ).__finalize__(self)
5873 def infer_objects(self: FrameOrSeries) -> FrameOrSeries:
5874 """
5875 Attempt to infer better dtypes for object columns.
5877 Attempts soft conversion of object-dtyped
5878 columns, leaving non-object and unconvertible
5879 columns unchanged. The inference rules are the
5880 same as during normal Series/DataFrame construction.
5882 .. versionadded:: 0.21.0
5884 Returns
5885 -------
5886 converted : same type as input object
5888 See Also
5889 --------
5890 to_datetime : Convert argument to datetime.
5891 to_timedelta : Convert argument to timedelta.
5892 to_numeric : Convert argument to numeric type.
5893 convert_dtypes : Convert argument to best possible dtype.
5895 Examples
5896 --------
5897 >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
5898 >>> df = df.iloc[1:]
5899 >>> df
5900 A
5901 1 1
5902 2 2
5903 3 3
5905 >>> df.dtypes
5906 A object
5907 dtype: object
5909 >>> df.infer_objects().dtypes
5910 A int64
5911 dtype: object
5912 """
5913 # numeric=False necessary to only soft convert;
5914 # python objects will still be converted to
5915 # native numpy numeric types
5916 return self._constructor(
5917 self._data.convert(
5918 datetime=True, numeric=False, timedelta=True, coerce=False, copy=True
5919 )
5920 ).__finalize__(self)
5922 def convert_dtypes(
5923 self: FrameOrSeries,
5924 infer_objects: bool_t = True,
5925 convert_string: bool_t = True,
5926 convert_integer: bool_t = True,
5927 convert_boolean: bool_t = True,
5928 ) -> FrameOrSeries:
5929 """
5930 Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.
5932 .. versionadded:: 1.0.0
5934 Parameters
5935 ----------
5936 infer_objects : bool, default True
5937 Whether object dtypes should be converted to the best possible types.
5938 convert_string : bool, default True
5939 Whether object dtypes should be converted to ``StringDtype()``.
5940 convert_integer : bool, default True
5941 Whether, if possible, conversion can be done to integer extension types.
5942 convert_boolean : bool, default True
5943 Whether object dtypes should be converted to ``BooleanDtype()``.
5945 Returns
5946 -------
5947 Series or DataFrame
5948 Copy of input object with new dtype.
5950 See Also
5951 --------
5952 infer_objects : Infer dtypes of objects.
5953 to_datetime : Convert argument to datetime.
5954 to_timedelta : Convert argument to timedelta.
5955 to_numeric : Convert argument to a numeric type.
5957 Notes
5958 -----
5960 By default, ``convert_dtypes`` will attempt to convert a Series (or each
5961 Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
5962 ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is
5963 possible to turn off individual conversions to ``StringDtype``, the integer
5964 extension types or ``BooleanDtype``, respectively.
5966 For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
5967 rules as during normal Series/DataFrame construction. Then, if possible,
5968 convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension
5969 type, otherwise leave as ``object``.
5971 If the dtype is integer, convert to an appropriate integer extension type.
5973 If the dtype is numeric, and consists of all integers, convert to an
5974 appropriate integer extension type.
5976 In the future, as new dtypes are added that support ``pd.NA``, the results
5977 of this method will change to support those new dtypes.
5979 Examples
5980 --------
5981 >>> df = pd.DataFrame(
5982 ... {
5983 ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
5984 ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
5985 ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
5986 ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
5987 ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
5988 ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
5989 ... }
5990 ... )
5992 Start with a DataFrame with default dtypes.
5994 >>> df
5995 a b c d e f
5996 0 1 x True h 10.0 NaN
5997 1 2 y False i NaN 100.5
5998 2 3 z NaN NaN 20.0 200.0
6000 >>> df.dtypes
6001 a int32
6002 b object
6003 c object
6004 d object
6005 e float64
6006 f float64
6007 dtype: object
6009 Convert the DataFrame to use best possible dtypes.
6011 >>> dfn = df.convert_dtypes()
6012 >>> dfn
6013 a b c d e f
6014 0 1 x True h 10 NaN
6015 1 2 y False i <NA> 100.5
6016 2 3 z <NA> <NA> 20 200.0
6018 >>> dfn.dtypes
6019 a Int32
6020 b string
6021 c boolean
6022 d string
6023 e Int64
6024 f float64
6025 dtype: object
6027 Start with a Series of strings and missing data represented by ``np.nan``.
6029 >>> s = pd.Series(["a", "b", np.nan])
6030 >>> s
6031 0 a
6032 1 b
6033 2 NaN
6034 dtype: object
6036 Obtain a Series with dtype ``StringDtype``.
6038 >>> s.convert_dtypes()
6039 0 a
6040 1 b
6041 2 <NA>
6042 dtype: string
6043 """
6044 if self.ndim == 1:
6045 return self._convert_dtypes(
6046 infer_objects, convert_string, convert_integer, convert_boolean
6047 )
6048 else:
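 # DataFrame case: convert each column independently with the same
 # options, then reassemble with a no-copy concat along the columns.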
6049 results = [
6050 col._convert_dtypes(
6051 infer_objects, convert_string, convert_integer, convert_boolean
6052 )
6053 for col_name, col in self.items()
6054 ]
6055 result = pd.concat(results, axis=1, copy=False)
6056 return result
6058 # ----------------------------------------------------------------------
6059 # Filling NA's
6061 def fillna(
6062 self: FrameOrSeries,
6063 value=None,
6064 method=None,
6065 axis=None,
6066 inplace: bool_t = False,
6067 limit=None,
6068 downcast=None,
6069 ) -> Optional[FrameOrSeries]:
6070 """
6071 Fill NA/NaN values using the specified method.
6073 Parameters
6074 ----------
6075 value : scalar, dict, Series, or DataFrame
6076 Value to use to fill holes (e.g. 0), alternately a
6077 dict/Series/DataFrame of values specifying which value to use for
6078 each index (for a Series) or column (for a DataFrame). Values not
6079 in the dict/Series/DataFrame will not be filled. This value cannot
6080 be a list.
6081 method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
6082 Method to use for filling holes in reindexed Series:
6083 pad / ffill: propagate last valid observation forward to next valid.
6084 backfill / bfill: use next valid observation to fill gap.
6085 axis : %(axes_single_arg)s
6086 Axis along which to fill missing values.
6087 inplace : bool, default False
6088 If True, fill in-place. Note: this will modify any
6089 other views on this object (e.g., a no-copy slice for a column in a
6090 DataFrame).
6091 limit : int, default None
6092 If method is specified, this is the maximum number of consecutive
6093 NaN values to forward/backward fill. In other words, if there is
6094 a gap with more than this number of consecutive NaNs, it will only
6095 be partially filled. If method is not specified, this is the
6096 maximum number of entries along the entire axis where NaNs will be
6097 filled. Must be greater than 0 if not None.
6098 downcast : dict, default None
6099 A dict of item->dtype of what to downcast if possible,
6100 or the string 'infer' which will try to downcast to an appropriate
6101 equal type (e.g. float64 to int64 if possible).
6103 Returns
6104 -------
6105 %(klass)s or None
6106 Object with missing values filled or None if ``inplace=True``.
6108 See Also
6109 --------
6110 interpolate : Fill NaN values using interpolation.
6111 reindex : Conform object to new index.
6112 asfreq : Convert TimeSeries to specified frequency.
6114 Examples
6115 --------
6116 >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
6117 ... [3, 4, np.nan, 1],
6118 ... [np.nan, np.nan, np.nan, 5],
6119 ... [np.nan, 3, np.nan, 4]],
6120 ... columns=list('ABCD'))
6121 >>> df
6122 A B C D
6123 0 NaN 2.0 NaN 0
6124 1 3.0 4.0 NaN 1
6125 2 NaN NaN NaN 5
6126 3 NaN 3.0 NaN 4
6128 Replace all NaN elements with 0s.
6130 >>> df.fillna(0)
6131 A B C D
6132 0 0.0 2.0 0.0 0
6133 1 3.0 4.0 0.0 1
6134 2 0.0 0.0 0.0 5
6135 3 0.0 3.0 0.0 4
6137 We can also propagate non-null values forward or backward.
6139 >>> df.fillna(method='ffill')
6140 A B C D
6141 0 NaN 2.0 NaN 0
6142 1 3.0 4.0 NaN 1
6143 2 3.0 4.0 NaN 5
6144 3 3.0 3.0 NaN 4
6146 Replace all NaN elements in columns 'A', 'B', 'C', and 'D' with 0, 1,
6147 2, and 3 respectively.
6149 >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
6150 >>> df.fillna(value=values)
6151 A B C D
6152 0 0.0 2.0 2.0 0
6153 1 3.0 4.0 2.0 1
6154 2 0.0 1.0 2.0 5
6155 3 0.0 3.0 2.0 4
6157 Only replace the first NaN element.
6159 >>> df.fillna(value=values, limit=1)
6160 A B C D
6161 0 0.0 2.0 2.0 0
6162 1 3.0 4.0 NaN 1
6163 2 NaN 1.0 NaN 5
6164 3 NaN 3.0 NaN 4
6165 """
6166 inplace = validate_bool_kwarg(inplace, "inplace")
6167 value, method = validate_fillna_kwargs(value, method)
6169 self._consolidate_inplace()
6171 # set the default here, so functions examining the signature
6172 # can detect if something was set (e.g. in groupby) (GH9221)
6173 if axis is None:
6174 axis = 0
6175 axis = self._get_axis_number(axis)
6177 if value is None:
6179 if self._is_mixed_type and axis == 1:
6180 if inplace:
6181 raise NotImplementedError()
6182 result = self.T.fillna(method=method, limit=limit).T
6184 # need to downcast here because of all of the transposes
6185 result._data = result._data.downcast()
6187 return result
6189 new_data = self._data.interpolate(
6190 method=method,
6191 axis=axis,
6192 limit=limit,
6193 inplace=inplace,
6194 coerce=True,
6195 downcast=downcast,
6196 )
6197 else:
6198 if len(self._get_axis(axis)) == 0:
6199 return self
6201 if self.ndim == 1:
6202 if isinstance(value, (dict, ABCSeries)):
6203 value = create_series_with_explicit_dtype(
6204 value, dtype_if_empty=object
6205 )
6206 elif not is_list_like(value):
6207 pass
6208 else:
6209 raise TypeError(
6210 '"value" parameter must be a scalar, dict '
6211 "or Series, but you passed a "
6212 f'"{type(value).__name__}"'
6213 )
6215 new_data = self._data.fillna(
6216 value=value, limit=limit, inplace=inplace, downcast=downcast
6217 )
6219 elif isinstance(value, (dict, ABCSeries)):
6220 if axis == 1:
6221 raise NotImplementedError(
6222 "Currently only can fill "
6223 "with dict/Series column "
6224 "by column"
6225 )
6227 result = self if inplace else self.copy()
6228 for k, v in value.items():
6229 if k not in result:
6230 continue
6231 obj = result[k]
6232 obj.fillna(v, limit=limit, inplace=True, downcast=downcast)
6233 return result if not inplace else None
6235 elif not is_list_like(value):
6236 new_data = self._data.fillna(
6237 value=value, limit=limit, inplace=inplace, downcast=downcast
6238 )
6239 elif isinstance(value, ABCDataFrame) and self.ndim == 2:
6240 new_data = self.where(self.notna(), value)
6241 else:
6242 raise ValueError(f"invalid fill value with a {type(value)}")
6244 if inplace:
6245 self._update_inplace(new_data)
6246 return None
6247 else:
6248 return self._constructor(new_data).__finalize__(self)
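 # A minimal sketch of the Series-valued ``value`` branch above (assumed
 # data): the fill values are aligned on the index before filling.
 #
 #   >>> s = pd.Series([np.nan, 2.0, np.nan])
 #   >>> s.fillna(pd.Series([10.0, 20.0, 30.0])).tolist()
 #   [10.0, 2.0, 30.0]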
6250 def ffill(
6251 self: FrameOrSeries,
6252 axis=None,
6253 inplace: bool_t = False,
6254 limit=None,
6255 downcast=None,
6256 ) -> Optional[FrameOrSeries]:
6257 """
6258 Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.
6260 Returns
6261 -------
6262 %(klass)s or None
6263 Object with missing values filled or None if ``inplace=True``.
6264 """
6265 return self.fillna(
6266 method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
6267 )
6269 def bfill(
6270 self: FrameOrSeries,
6271 axis=None,
6272 inplace: bool_t = False,
6273 limit=None,
6274 downcast=None,
6275 ) -> Optional[FrameOrSeries]:
6276 """
6277 Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.
6279 Returns
6280 -------
6281 %(klass)s or None
6282 Object with missing values filled or None if ``inplace=True``.
6283 """
6284 return self.fillna(
6285 method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
6286 )
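 # A minimal sketch of the two synonyms above (assumed data):
 #
 #   >>> s = pd.Series([np.nan, 1.0, np.nan, 3.0])
 #   >>> s.ffill().tolist()
 #   [nan, 1.0, 1.0, 3.0]
 #   >>> s.bfill().tolist()
 #   [1.0, 1.0, 3.0, 3.0]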
6288 _shared_docs[
6289 "replace"
6290 ] = """
6291 Replace values given in `to_replace` with `value`.
6293 Values of the %(klass)s are replaced with other values dynamically.
6294 This differs from updating with ``.loc`` or ``.iloc``, which require
6295 you to specify a location to update with some value.
6297 Parameters
6298 ----------
6299 to_replace : str, regex, list, dict, Series, int, float, or None
6300 How to find the values that will be replaced.
6302 * numeric, str or regex:
6304 - numeric: numeric values equal to `to_replace` will be
6305 replaced with `value`
6306 - str: string exactly matching `to_replace` will be replaced
6307 with `value`
6308 - regex: regexs matching `to_replace` will be replaced with
6309 `value`
6311 * list of str, regex, or numeric:
6313 - First, if `to_replace` and `value` are both lists, they
6314 **must** be the same length.
6315 - Second, if ``regex=True`` then all of the strings in **both**
6316 lists will be interpreted as regexs otherwise they will match
6317 directly. This doesn't matter much for `value` since there
6318 are only a few possible substitution regexes you can use.
6319 - str, regex and numeric rules apply as above.
6321 * dict:
6323 - Dicts can be used to specify different replacement values
6324 for different existing values. For example,
6325 ``{'a': 'b', 'y': 'z'}`` replaces the value 'a' with 'b' and
6326 'y' with 'z'. To use a dict in this way the `value`
6327 parameter should be `None`.
6328 - For a DataFrame a dict can specify that different values
6329 should be replaced in different columns. For example,
6330 ``{'a': 1, 'b': 'z'}`` looks for the value 1 in column 'a'
6331 and the value 'z' in column 'b' and replaces these values
6332 with whatever is specified in `value`. The `value` parameter
6333 should not be ``None`` in this case. You can treat this as a
6334 special case of passing two lists except that you are
6335 specifying the column to search in.
6336 - For a DataFrame nested dictionaries, e.g.,
6337 ``{'a': {'b': np.nan}}``, are read as follows: look in column
6338 'a' for the value 'b' and replace it with NaN. The `value`
6339 parameter should be ``None`` to use a nested dict in this
6340 way. You can nest regular expressions as well. Note that
6341 column names (the top-level dictionary keys in a nested
6342 dictionary) **cannot** be regular expressions.
6344 * None:
6346 - This means that the `regex` argument must be a string,
6347 compiled regular expression, or list, dict, ndarray or
6348 Series of such elements. If `value` is also ``None`` then
6349 this **must** be a nested dictionary or Series.
6351 See the examples section for examples of each of these.
6352 value : scalar, dict, list, str, regex, default None
6353 Value to replace any values matching `to_replace` with.
6354 For a DataFrame a dict of values can be used to specify which
6355 value to use for each column (columns not in the dict will not be
6356 filled). Regular expressions, strings and lists or dicts of such
6357 objects are also allowed.
6358 inplace : bool, default False
6359 If True, performs the replacement in place. Note: this will modify
6360 any other views on this object (e.g. a column from a DataFrame).
6361 Returns the caller if this is True.
6362 limit : int, default None
6363 Maximum size gap to forward or backward fill.
6364 regex : bool or same types as `to_replace`, default False
6365 Whether to interpret `to_replace` and/or `value` as regular
6366 expressions. If this is ``True`` then `to_replace` *must* be a
6367 string. Alternatively, this could be a regular expression or a
6368 list, dict, or array of regular expressions in which case
6369 `to_replace` must be ``None``.
6370 method : {'pad', 'ffill', 'bfill', `None`}
6371 The method to use for replacement, when `to_replace` is a
6372 scalar, list or tuple and `value` is ``None``.
6374 .. versionchanged:: 0.23.0
6375 Added to DataFrame.
6377 Returns
6378 -------
6379 %(klass)s
6380 Object after replacement.
6382 Raises
6383 ------
6384 AssertionError
6385 * If `regex` is not a ``bool`` and `to_replace` is not
6386 ``None``.
6387 TypeError
6388 * If `to_replace` is a ``dict`` and `value` is not a ``list``,
6389 ``dict``, ``ndarray``, or ``Series``
6390 * If `to_replace` is ``None`` and `regex` is not compilable
6391 into a regular expression or is a list, dict, ndarray, or
6392 Series.
6393 * When replacing multiple ``bool`` or ``datetime64`` objects and
6394 the arguments to `to_replace` does not match the type of the
6395 value being replaced
6396 ValueError
6397 * If a ``list`` or an ``ndarray`` is passed to `to_replace` and
6398 `value` but they are not the same length.
6400 See Also
6401 --------
6402 %(klass)s.fillna : Fill NA values.
6403 %(klass)s.where : Replace values based on boolean condition.
6404 Series.str.replace : Simple string replacement.
6406 Notes
6407 -----
6408 * Regex substitution is performed under the hood with ``re.sub``. The
6409 rules for substitution for ``re.sub`` are the same.
6410 * Regular expressions will only substitute on strings, meaning you
6411 cannot provide, for example, a regular expression matching floating
6412 point numbers and expect the columns in your frame that have a
6413 numeric dtype to be matched. However, if those floating point
6414 numbers *are* strings, then you can do this.
6415 * This method has *a lot* of options. You are encouraged to experiment
6416 and play with this method to gain intuition about how it works.
6417 * When a dict is used as the `to_replace` value, the dict
6418 key(s) act as the `to_replace` part and the dict value(s)
6419 act as the `value` parameter.
6421 Examples
6422 --------
6424 **Scalar `to_replace` and `value`**
6426 >>> s = pd.Series([0, 1, 2, 3, 4])
6427 >>> s.replace(0, 5)
6428 0 5
6429 1 1
6430 2 2
6431 3 3
6432 4 4
6433 dtype: int64
6435 >>> df = pd.DataFrame({'A': [0, 1, 2, 3, 4],
6436 ... 'B': [5, 6, 7, 8, 9],
6437 ... 'C': ['a', 'b', 'c', 'd', 'e']})
6438 >>> df.replace(0, 5)
6439 A B C
6440 0 5 5 a
6441 1 1 6 b
6442 2 2 7 c
6443 3 3 8 d
6444 4 4 9 e
6446 **List-like `to_replace`**
6448 >>> df.replace([0, 1, 2, 3], 4)
6449 A B C
6450 0 4 5 a
6451 1 4 6 b
6452 2 4 7 c
6453 3 4 8 d
6454 4 4 9 e
6456 >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1])
6457 A B C
6458 0 4 5 a
6459 1 3 6 b
6460 2 2 7 c
6461 3 1 8 d
6462 4 4 9 e
6464 >>> s.replace([1, 2], method='bfill')
6465 0 0
6466 1 3
6467 2 3
6468 3 3
6469 4 4
6470 dtype: int64
6472 **dict-like `to_replace`**
6474 >>> df.replace({0: 10, 1: 100})
6475 A B C
6476 0 10 5 a
6477 1 100 6 b
6478 2 2 7 c
6479 3 3 8 d
6480 4 4 9 e
6482 >>> df.replace({'A': 0, 'B': 5}, 100)
6483 A B C
6484 0 100 100 a
6485 1 1 6 b
6486 2 2 7 c
6487 3 3 8 d
6488 4 4 9 e
6490 >>> df.replace({'A': {0: 100, 4: 400}})
6491 A B C
6492 0 100 5 a
6493 1 1 6 b
6494 2 2 7 c
6495 3 3 8 d
6496 4 400 9 e
6498 **Regular expression `to_replace`**
6500 >>> df = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
6501 ... 'B': ['abc', 'bar', 'xyz']})
6502 >>> df.replace(to_replace=r'^ba.$', value='new', regex=True)
6503 A B
6504 0 new abc
6505 1 foo new
6506 2 bait xyz
6508 >>> df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True)
6509 A B
6510 0 new abc
6511 1 foo bar
6512 2 bait xyz
6514 >>> df.replace(regex=r'^ba.$', value='new')
6515 A B
6516 0 new abc
6517 1 foo new
6518 2 bait xyz
6520 >>> df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})
6521 A B
6522 0 new abc
6523 1 xyz new
6524 2 bait xyz
6526 >>> df.replace(regex=[r'^ba.$', 'foo'], value='new')
6527 A B
6528 0 new abc
6529 1 new new
6530 2 bait xyz
6532 Note that when replacing multiple ``bool`` or ``datetime64`` objects,
6533 the data types in the `to_replace` parameter must match the data
6534 type of the value being replaced:
6536 >>> df = pd.DataFrame({'A': [True, False, True],
6537 ... 'B': [False, True, False]})
6538 >>> df.replace({'a string': 'new value', True: False}) # raises
6539 Traceback (most recent call last):
6540 ...
6541 TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str'
6543 This raises a ``TypeError`` because one of the ``dict`` keys is not of
6544 the correct type for replacement.
6546 Compare the behavior of ``s.replace({'a': None})`` and
6547 ``s.replace('a', None)`` to understand the peculiarities
6548 of the `to_replace` parameter:
6550 >>> s = pd.Series([10, 'a', 'a', 'b', 'a'])
6552 When a dict is used as the `to_replace` value, the value(s)
6553 in the dict take the role of the `value` parameter.
6554 ``s.replace({'a': None})`` is equivalent to
6555 ``s.replace(to_replace={'a': None}, value=None, method=None)``:
6557 >>> s.replace({'a': None})
6558 0 10
6559 1 None
6560 2 None
6561 3 b
6562 4 None
6563 dtype: object
6565 When ``value=None`` and `to_replace` is a scalar, list or
6566 tuple, `replace` uses the method parameter (default 'pad') to do the
6567 replacement. That is why the 'a' values are replaced by 10
6568 in rows 1 and 2, and by 'b' in row 4, in this case.
6569 The command ``s.replace('a', None)`` is actually equivalent to
6570 ``s.replace(to_replace='a', value=None, method='pad')``:
6572 >>> s.replace('a', None)
6573 0 10
6574 1 10
6575 2 10
6576 3 b
6577 4 b
6578 dtype: object
6579 """
6581 @Appender(_shared_docs["replace"] % _shared_doc_kwargs)
6582 def replace(
6583 self,
6584 to_replace=None,
6585 value=None,
6586 inplace=False,
6587 limit=None,
6588 regex=False,
6589 method="pad",
6590 ):
6591 inplace = validate_bool_kwarg(inplace, "inplace")
6592 if not is_bool(regex) and to_replace is not None:
6593 raise AssertionError("'to_replace' must be 'None' if 'regex' is not a bool")
6595 self._consolidate_inplace()
6597 if value is None:
6598 # passing a single value that is scalar like
6599 # when value is None (GH5319), for compat
6600 if not is_dict_like(to_replace) and not is_dict_like(regex):
6601 to_replace = [to_replace]
6603 if isinstance(to_replace, (tuple, list)):
6604 if isinstance(self, ABCDataFrame):
6605 return self.apply(
6606 _single_replace, args=(to_replace, method, inplace, limit)
6607 )
6608 return _single_replace(self, to_replace, method, inplace, limit)
6610 if not is_dict_like(to_replace):
6611 if not is_dict_like(regex):
6612 raise TypeError(
6613 'If "to_replace" and "value" are both None '
6614 'and "to_replace" is not a list, then '
6615 "regex must be a mapping"
6616 )
6617 to_replace = regex
6618 regex = True
6620 items = list(to_replace.items())
6621 keys, values = zip(*items) if items else ([], [])
6623 are_mappings = [is_dict_like(v) for v in values]
6625 if any(are_mappings):
6626 if not all(are_mappings):
6627 raise TypeError(
6628 "If a nested mapping is passed, all values "
6629 "of the top level mapping must be mappings"
6630 )
6631 # passed a nested dict/Series
6632 to_rep_dict = {}
6633 value_dict = {}
6635 for k, v in items:
6636 keys, values = list(zip(*v.items())) or ([], [])
6638 to_rep_dict[k] = list(keys)
6639 value_dict[k] = list(values)
6641 to_replace, value = to_rep_dict, value_dict
6642 else:
6643 to_replace, value = keys, values
6645 return self.replace(
6646 to_replace, value, inplace=inplace, limit=limit, regex=regex
6647 )
6648 else:
6650 # need a non-zero len on all axes
6651 if not self.size:
6652 return self
6654 new_data = self._data
6655 if is_dict_like(to_replace):
6656 if is_dict_like(value): # {'A' : NA} -> {'A' : 0}
6657 res = self if inplace else self.copy()
6658 for c, src in to_replace.items():
6659 if c in value and c in self:
6660 # object conversion is handled in
6661 # series.replace which is called recursively
6662 res[c] = res[c].replace(
6663 to_replace=src,
6664 value=value[c],
6665 inplace=False,
6666 regex=regex,
6667 )
6668 return None if inplace else res
6670 # {'A': NA} -> 0
6671 elif not is_list_like(value):
6672 keys = [(k, src) for k, src in to_replace.items() if k in self]
6673 keys_len = len(keys) - 1
6674 for i, (k, src) in enumerate(keys):
6675 convert = i == keys_len
6676 new_data = new_data.replace(
6677 to_replace=src,
6678 value=value,
6679 filter=[k],
6680 inplace=inplace,
6681 regex=regex,
6682 convert=convert,
6683 )
6684 else:
6685 raise TypeError("value argument must be scalar, dict, or Series")
6687 elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing']
6688 if is_list_like(value):
6689 if len(to_replace) != len(value):
6690 raise ValueError(
6691 "Replacement lists must match in length. "
6692 f"Expecting {len(to_replace)}, got {len(value)}."
6693 )
6695 new_data = self._data.replace_list(
6696 src_list=to_replace,
6697 dest_list=value,
6698 inplace=inplace,
6699 regex=regex,
6700 )
6702 else: # [NA, ''] -> 0
6703 new_data = self._data.replace(
6704 to_replace=to_replace, value=value, inplace=inplace, regex=regex
6705 )
6706 elif to_replace is None:
6707 if not (
6708 is_re_compilable(regex)
6709 or is_list_like(regex)
6710 or is_dict_like(regex)
6711 ):
6712 raise TypeError(
6713 f"'regex' must be a string or a compiled regular expression "
6714 f"or a list or dict of strings or regular expressions, "
6715 f"you passed a {repr(type(regex).__name__)}"
6716 )
6717 return self.replace(
6718 regex, value, inplace=inplace, limit=limit, regex=True
6719 )
6720 else:
6722 # dest iterable dict-like
6723 if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1}
6724 new_data = self._data
6726 for k, v in value.items():
6727 if k in self:
6728 new_data = new_data.replace(
6729 to_replace=to_replace,
6730 value=v,
6731 filter=[k],
6732 inplace=inplace,
6733 regex=regex,
6734 )
6736 elif not is_list_like(value): # NA -> 0
6737 new_data = self._data.replace(
6738 to_replace=to_replace, value=value, inplace=inplace, regex=regex
6739 )
6740 else:
6741 raise TypeError(
6742 f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}'
6743 )
6745 if inplace:
6746 self._update_inplace(new_data)
6747 else:
6748 return self._constructor(new_data).__finalize__(self)
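 # A minimal sketch of the nested-dict path above (assumed data): a nested
 # mapping such as ``{'A': {0: 100}}`` is split into per-column key and
 # value lists before ``self.replace`` recurses.
 #
 #   >>> df = pd.DataFrame({'A': [0, 1], 'B': [5, 6]})
 #   >>> df.replace({'A': {0: 100}})['A'].tolist()
 #   [100, 1]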
6750 _shared_docs[
6751 "interpolate"
6752 ] = """
6753 Please note that only ``method='linear'`` is supported for
6754 DataFrame/Series with a MultiIndex.
6756 Parameters
6757 ----------
6758 method : str, default 'linear'
6759 Interpolation technique to use. One of:
6761 * 'linear': Ignore the index and treat the values as equally
6762 spaced. This is the only method supported on MultiIndexes.
6763 * 'time': Works on daily and higher resolution data to interpolate
6764 given length of interval.
6765 * 'index', 'values': use the actual numerical values of the index.
6766 * 'pad': Fill in NaNs using existing values.
6767 * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'spline',
6768 'barycentric', 'polynomial': Passed to
6769 `scipy.interpolate.interp1d`. These methods use the numerical
6770 values of the index. Both 'polynomial' and 'spline' require that
6771 you also specify an `order` (int), e.g.
6772 ``df.interpolate(method='polynomial', order=5)``.
6773 * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima':
6774 Wrappers around the SciPy interpolation methods of similar
6775 names. See `Notes`.
6776 * 'from_derivatives': Refers to
6777 `scipy.interpolate.BPoly.from_derivatives` which
6778 replaces 'piecewise_polynomial' interpolation method in
6779 scipy 0.18.
6780 axis : {0 or 'index', 1 or 'columns', None}, default None
6781 Axis to interpolate along.
6782 limit : int, optional
6783 Maximum number of consecutive NaNs to fill. Must be greater than
6784 0.
6785 inplace : bool, default False
6786 Update the data in place if possible.
6787 limit_direction : {'forward', 'backward', 'both'}, default 'forward'
6788 If limit is specified, consecutive NaNs will be filled in this
6789 direction.
6790 limit_area : {`None`, 'inside', 'outside'}, default None
6791 If limit is specified, consecutive NaNs will be filled with this
6792 restriction.
6794 * ``None``: No fill restriction.
6795 * 'inside': Only fill NaNs surrounded by valid values
6796 (interpolate).
6797 * 'outside': Only fill NaNs outside valid values (extrapolate).
6799 .. versionadded:: 0.23.0
6801 downcast : optional, 'infer' or None, default None
6802 Downcast dtypes if possible.
6803 **kwargs
6804 Keyword arguments to pass on to the interpolating function.
6806 Returns
6807 -------
6808 Series or DataFrame
6809 Returns the same object type as the caller, interpolated at
6810 some or all ``NaN`` values.
6812 See Also
6813 --------
6814 fillna : Fill missing values using different methods.
6815 scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials
6816 (Akima interpolator).
6817 scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the
6818 Bernstein basis.
6819 scipy.interpolate.interp1d : Interpolate a 1-D function.
6820 scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh
6821 interpolator).
6822 scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic
6823 interpolation.
6824 scipy.interpolate.CubicSpline : Cubic spline data interpolator.
6826 Notes
6827 -----
6828 The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima'
6829 methods are wrappers around the respective SciPy implementations of
6830 similar names. These use the actual numerical values of the index.
6831 For more information on their behavior, see the
6832 `SciPy documentation
6833 <http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation>`__
6834 and `SciPy tutorial
6835 <http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html>`__.
6837 Examples
6838 --------
6839 Filling in ``NaN`` in a :class:`~pandas.Series` via linear
6840 interpolation.
6842 >>> s = pd.Series([0, 1, np.nan, 3])
6843 >>> s
6844 0 0.0
6845 1 1.0
6846 2 NaN
6847 3 3.0
6848 dtype: float64
6849 >>> s.interpolate()
6850 0 0.0
6851 1 1.0
6852 2 2.0
6853 3 3.0
6854 dtype: float64
6856 Filling in ``NaN`` in a Series by padding, but filling at most two
6857 consecutive ``NaN`` at a time.
6859 >>> s = pd.Series([np.nan, "single_one", np.nan,
6860 ... "fill_two_more", np.nan, np.nan, np.nan,
6861 ... 4.71, np.nan])
6862 >>> s
6863 0 NaN
6864 1 single_one
6865 2 NaN
6866 3 fill_two_more
6867 4 NaN
6868 5 NaN
6869 6 NaN
6870 7 4.71
6871 8 NaN
6872 dtype: object
6873 >>> s.interpolate(method='pad', limit=2)
6874 0 NaN
6875 1 single_one
6876 2 single_one
6877 3 fill_two_more
6878 4 fill_two_more
6879 5 fill_two_more
6880 6 NaN
6881 7 4.71
6882 8 4.71
6883 dtype: object
6885 Filling in ``NaN`` in a Series via polynomial interpolation or splines:
6886 Both 'polynomial' and 'spline' methods require that you also specify
6887 an ``order`` (int).
6889 >>> s = pd.Series([0, 2, np.nan, 8])
6890 >>> s.interpolate(method='polynomial', order=2)
6891 0 0.000000
6892 1 2.000000
6893 2 4.666667
6894 3 8.000000
6895 dtype: float64
6897 Fill the DataFrame forward (that is, going down) along each column
6898 using linear interpolation.
6900 Note how the last entry in column 'a' is interpolated differently,
6901 because there is no entry after it to use for interpolation.
6902 Note how the first entry in column 'b' remains ``NaN``, because there
6903 is no entry before it to use for interpolation.
6905 >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0),
6906 ... (np.nan, 2.0, np.nan, np.nan),
6907 ... (2.0, 3.0, np.nan, 9.0),
6908 ... (np.nan, 4.0, -4.0, 16.0)],
6909 ... columns=list('abcd'))
6910 >>> df
6911 a b c d
6912 0 0.0 NaN -1.0 1.0
6913 1 NaN 2.0 NaN NaN
6914 2 2.0 3.0 NaN 9.0
6915 3 NaN 4.0 -4.0 16.0
6916 >>> df.interpolate(method='linear', limit_direction='forward', axis=0)
6917 a b c d
6918 0 0.0 NaN -1.0 1.0
6919 1 1.0 2.0 -2.0 5.0
6920 2 2.0 3.0 -3.0 9.0
6921 3 2.0 4.0 -4.0 16.0
6923 Using polynomial interpolation.
6925 >>> df['d'].interpolate(method='polynomial', order=2)
6926 0 1.0
6927 1 4.0
6928 2 9.0
6929 3 16.0
6930 Name: d, dtype: float64
6931 """
6933 @Appender(_shared_docs["interpolate"] % _shared_doc_kwargs)
6934 def interpolate(
6935 self,
6936 method="linear",
6937 axis=0,
6938 limit=None,
6939 inplace=False,
6940 limit_direction="forward",
6941 limit_area=None,
6942 downcast=None,
6943 **kwargs,
6944 ):
6945 """
6946 Interpolate values according to different methods.
6947 """
6948 inplace = validate_bool_kwarg(inplace, "inplace")
6950 axis = self._get_axis_number(axis)
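 # For axis=1 the object is transposed up front so the interpolation
 # below can always run along axis 0; the result (or the in-place data)
 # is transposed back after interpolating.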
6952 if axis == 0:
6953 ax = self._info_axis_name
6954 _maybe_transposed_self = self
6955 elif axis == 1:
6956 _maybe_transposed_self = self.T
6957 ax = 1
6959 ax = _maybe_transposed_self._get_axis_number(ax)
6961 if _maybe_transposed_self.ndim == 2:
6962 alt_ax = 1 - ax
6963 else:
6964 alt_ax = ax
6966 if isinstance(_maybe_transposed_self.index, MultiIndex) and method != "linear":
6967 raise ValueError(
6968 "Only `method=linear` interpolation is supported on MultiIndexes."
6969 )
6971 if _maybe_transposed_self._data.get_dtype_counts().get("object") == len(
6972 _maybe_transposed_self.T
6973 ):
6974 raise TypeError(
6975 "Cannot interpolate with all object-dtype columns "
6976 "in the DataFrame. Try setting at least one "
6977 "column to a numeric dtype."
6978 )
6980 # create/use the index
6981 if method == "linear":
6982 # prior default
6983 index = np.arange(len(_maybe_transposed_self._get_axis(alt_ax)))
6984 else:
6985 index = _maybe_transposed_self._get_axis(alt_ax)
6986 methods = {"index", "values", "nearest", "time"}
6987 is_numeric_or_datetime = (
6988 is_numeric_dtype(index)
6989 or is_datetime64_any_dtype(index)
6990 or is_timedelta64_dtype(index)
6991 )
6992 if method not in methods and not is_numeric_or_datetime:
6993 raise ValueError(
6994 "Index column must be numeric or datetime type when "
6995 f"using {method} method other than linear. "
6996 "Try setting a numeric or datetime index column before "
6997 "interpolating."
6998 )
7000 if isna(index).any():
7001 raise NotImplementedError(
7002 "Interpolation with NaNs in the index "
7003 "has not been implemented. Try filling "
7004 "those NaNs before interpolating."
7005 )
7006 data = _maybe_transposed_self._data
7007 new_data = data.interpolate(
7008 method=method,
7009 axis=ax,
7010 index=index,
7011 values=_maybe_transposed_self,
7012 limit=limit,
7013 limit_direction=limit_direction,
7014 limit_area=limit_area,
7015 inplace=inplace,
7016 downcast=downcast,
7017 **kwargs,
7018 )
7020 if inplace:
7021 if axis == 1:
7022 new_data = self._constructor(new_data).T._data
7023 self._update_inplace(new_data)
7024 else:
7025 res = self._constructor(new_data).__finalize__(self)
7026 if axis == 1:
7027 res = res.T
7028 return res
7030 # ----------------------------------------------------------------------
7031 # Timeseries methods
7033 def asof(self, where, subset=None):
7034 """
7035 Return the last row(s) without any NaNs before `where`.
7037 The last row (for each element in `where`, if list) without any
7038 NaN is taken.
7039 In case of a :class:`~pandas.DataFrame`, the last row without NaN
7040 considering only the subset of columns (if not `None`) is taken.
7042 If there is no good value, NaN is returned for a Series, or
7043 a Series of NaN values for a DataFrame.
7045 Parameters
7046 ----------
7047 where : date or array-like of dates
7048 Date(s) before which the last row(s) are returned.
7049 subset : str or array-like of str, default `None`
7050 For DataFrame, if not `None`, only use these columns to
7051 check for NaNs.
7053 Returns
7054 -------
7055 scalar, Series, or DataFrame
7057 The return can be:
7059 * scalar : when `self` is a Series and `where` is a scalar
7060 * Series: when `self` is a Series and `where` is an array-like,
7061 or when `self` is a DataFrame and `where` is a scalar
7062 * DataFrame : when `self` is a DataFrame and `where` is an
7063 array-like
7067 See Also
7068 --------
7069 merge_asof : Perform an asof merge. Similar to left join.
7071 Notes
7072 -----
7073 Dates are assumed to be sorted. Raises if this is not the case.
7075 Examples
7076 --------
7077 A Series and a scalar `where`.
7079 >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])
7080 >>> s
7081 10 1.0
7082 20 2.0
7083 30 NaN
7084 40 4.0
7085 dtype: float64
7087 >>> s.asof(20)
7088 2.0
7090 For a sequence `where`, a Series is returned. The first value is
7091 NaN, because the first element of `where` is before the first
7092 index value.
7094 >>> s.asof([5, 20])
7095 5 NaN
7096 20 2.0
7097 dtype: float64
7099 Missing values are not considered. The following is ``2.0``, not
7100 NaN, even though NaN is at the index location for ``30``.
7102 >>> s.asof(30)
7103 2.0
7105 Take all columns into consideration
7107 >>> df = pd.DataFrame({'a': [10, 20, 30, 40, 50],
7108 ... 'b': [None, None, None, None, 500]},
7109 ... index=pd.DatetimeIndex(['2018-02-27 09:01:00',
7110 ... '2018-02-27 09:02:00',
7111 ... '2018-02-27 09:03:00',
7112 ... '2018-02-27 09:04:00',
7113 ... '2018-02-27 09:05:00']))
7114 >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
7115 ... '2018-02-27 09:04:30']))
7116 a b
7117 2018-02-27 09:03:30 NaN NaN
7118 2018-02-27 09:04:30 NaN NaN
7120 Take a single column into consideration
7122 >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
7123 ... '2018-02-27 09:04:30']),
7124 ... subset=['a'])
7125 a b
7126 2018-02-27 09:03:30 30.0 NaN
7127 2018-02-27 09:04:30 40.0 NaN
7128 """
7129 if isinstance(where, str):
7130 where = Timestamp(where)
7132 if not self.index.is_monotonic:
7133 raise ValueError("asof requires a sorted index")
7135 is_series = isinstance(self, ABCSeries)
7136 if is_series:
7137 if subset is not None:
7138 raise ValueError("subset is not valid for Series")
7139 else:
7140 if subset is None:
7141 subset = self.columns
7142 if not is_list_like(subset):
7143 subset = [subset]
7145 is_list = is_list_like(where)
7146 if not is_list:
7147 start = self.index[0]
7148 if isinstance(self.index, PeriodIndex):
7149 where = Period(where, freq=self.index.freq)
7151 if where < start:
7152 if not is_series:
7153 from pandas import Series
7155 return Series(index=self.columns, name=where, dtype=np.float64)
7156 return np.nan
7158 # It's always much faster to use a *while* loop here for
7159 # Series than pre-computing all the NAs. However a
7160 # *while* loop is extremely expensive for DataFrame
7161 # so we later pre-compute all the NAs and use the same
7162 # code path whether *where* is a scalar or list.
7163 # See PR: https://github.com/pandas-dev/pandas/pull/14476
7164 if is_series:
7165 loc = self.index.searchsorted(where, side="right")
7166 if loc > 0:
7167 loc -= 1
7169 values = self._values
7170 while loc > 0 and isna(values[loc]):
7171 loc -= 1
7172 return values[loc]
7174 if not isinstance(where, Index):
7175 where = Index(where) if is_list else Index([where])
7177 nulls = self.isna() if is_series else self[subset].isna().any(1)
7178 if nulls.all():
7179 if is_series:
7180 return self._constructor(np.nan, index=where, name=self.name)
7181 elif is_list:
7182 from pandas import DataFrame
7184 return DataFrame(np.nan, index=where, columns=self.columns)
7185 else:
7186 from pandas import Series
7188 return Series(np.nan, index=self.columns, name=where[0])
7190 locs = self.index.asof_locs(where, ~(nulls.values))
7192 # mask the missing
7193 missing = locs == -1
7194 data = self.take(locs)
7195 data.index = where
7196 data.loc[missing] = np.nan
7197 return data if is_list else data.iloc[-1]
7199 # ----------------------------------------------------------------------
7200 # Action Methods
7202 _shared_docs[
7203 "isna"
7204 ] = """
7205 Detect missing values.
7207 Return a boolean same-sized object indicating if the values are NA.
7208 NA values, such as None or :attr:`numpy.NaN`, get mapped to True
7209 values.
7210 Everything else gets mapped to False values. Characters such as empty
7211 strings ``''`` or :attr:`numpy.inf` are not considered NA values
7212 (unless you set ``pandas.options.mode.use_inf_as_na = True``).
7214 Returns
7215 -------
7216 %(klass)s
7217 Mask of bool values for each element in %(klass)s that
7218 indicates whether an element is an NA value.
7220 See Also
7221 --------
7222 %(klass)s.isnull : Alias of isna.
7223 %(klass)s.notna : Boolean inverse of isna.
7224 %(klass)s.dropna : Omit axes labels with missing values.
7225 isna : Top-level isna.
7227 Examples
7228 --------
7229 Show which entries in a DataFrame are NA.
7231 >>> df = pd.DataFrame({'age': [5, 6, np.NaN],
7232 ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'),
7233 ... pd.Timestamp('1940-04-25')],
7234 ... 'name': ['Alfred', 'Batman', ''],
7235 ... 'toy': [None, 'Batmobile', 'Joker']})
7236 >>> df
7237 age born name toy
7238 0 5.0 NaT Alfred None
7239 1 6.0 1939-05-27 Batman Batmobile
7240 2 NaN 1940-04-25 Joker
7242 >>> df.isna()
7243 age born name toy
7244 0 False True False True
7245 1 False False False False
7246 2 True False False False
7248 Show which entries in a Series are NA.
7250 >>> ser = pd.Series([5, 6, np.NaN])
7251 >>> ser
7252 0 5.0
7253 1 6.0
7254 2 NaN
7255 dtype: float64
7257 >>> ser.isna()
7258 0 False
7259 1 False
7260 2 True
7261 dtype: bool
7262 """
7264 @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
7265 def isna(self: FrameOrSeries) -> FrameOrSeries:
7266 return isna(self).__finalize__(self)
7268 @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
7269 def isnull(self: FrameOrSeries) -> FrameOrSeries:
7270 return isna(self).__finalize__(self)
7272 _shared_docs[
7273 "notna"
7274 ] = """
7275 Detect existing (non-missing) values.
7277 Return a boolean same-sized object indicating if the values are not NA.
7278 Non-missing values get mapped to True. Characters such as empty
7279 strings ``''`` or :attr:`numpy.inf` are not considered NA values
7280 (unless you set ``pandas.options.mode.use_inf_as_na = True``).
7281 NA values, such as None or :attr:`numpy.NaN`, get mapped to False
7282 values.
7284 Returns
7285 -------
7286 %(klass)s
7287 Mask of bool values for each element in %(klass)s that
7288 indicates whether an element is not an NA value.
7290 See Also
7291 --------
7292 %(klass)s.notnull : Alias of notna.
7293 %(klass)s.isna : Boolean inverse of notna.
7294 %(klass)s.dropna : Omit axes labels with missing values.
7295 notna : Top-level notna.
7297 Examples
7298 --------
7299 Show which entries in a DataFrame are not NA.
7301 >>> df = pd.DataFrame({'age': [5, 6, np.NaN],
7302 ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'),
7303 ... pd.Timestamp('1940-04-25')],
7304 ... 'name': ['Alfred', 'Batman', ''],
7305 ... 'toy': [None, 'Batmobile', 'Joker']})
7306 >>> df
7307 age born name toy
7308 0 5.0 NaT Alfred None
7309 1 6.0 1939-05-27 Batman Batmobile
7310 2 NaN 1940-04-25 Joker
7312 >>> df.notna()
7313 age born name toy
7314 0 True False True False
7315 1 True True True True
7316 2 False True True True
7318 Show which entries in a Series are not NA.
7320 >>> ser = pd.Series([5, 6, np.NaN])
7321 >>> ser
7322 0 5.0
7323 1 6.0
7324 2 NaN
7325 dtype: float64
7327 >>> ser.notna()
7328 0 True
7329 1 True
7330 2 False
7331 dtype: bool
7332 """
7334 @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
7335 def notna(self: FrameOrSeries) -> FrameOrSeries:
7336 return notna(self).__finalize__(self)
7338 @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
7339 def notnull(self: FrameOrSeries) -> FrameOrSeries:
7340 return notna(self).__finalize__(self)
7342 def _clip_with_scalar(self, lower, upper, inplace: bool_t = False):
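 # Clip against scalar bound(s) using boolean masks: values beyond a
 # bound are replaced by the bound via ``where``, and positions that were
 # NA to begin with are re-set to NaN afterwards, so clipping never fills
 # missing values.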
7343 if (lower is not None and np.any(isna(lower))) or (
7344 upper is not None and np.any(isna(upper))
7345 ):
7346 raise ValueError("Cannot use an NA value as a clip threshold")
7348 result = self
7349 mask = isna(self.values)
7351 with np.errstate(all="ignore"):
7352 if upper is not None:
7353 subset = self.to_numpy() <= upper
7354 result = result.where(subset, upper, axis=None, inplace=False)
7355 if lower is not None:
7356 subset = self.to_numpy() >= lower
7357 result = result.where(subset, lower, axis=None, inplace=False)
7359 if np.any(mask):
7360 result[mask] = np.nan
7362 if inplace:
7363 self._update_inplace(result)
7364 else:
7365 return result
7367 def _clip_with_one_bound(self, threshold, method, axis, inplace):
7369 if axis is not None:
7370 axis = self._get_axis_number(axis)
7372 # method is self.le for upper bound and self.ge for lower bound
7373 if is_scalar(threshold) and is_number(threshold):
7374 if method.__name__ == "le":
7375 return self._clip_with_scalar(None, threshold, inplace=inplace)
7376 return self._clip_with_scalar(threshold, None, inplace=inplace)
7378 subset = method(threshold, axis=axis) | isna(self)
7380 # GH #15390
7381 # In order for where method to work, the threshold must
7382 # be transformed to NDFrame from other array like structure.
7383 if (not isinstance(threshold, ABCSeries)) and is_list_like(threshold):
7384 if isinstance(self, ABCSeries):
7385 threshold = self._constructor(threshold, index=self.index)
7386 else:
7387 threshold = _align_method_FRAME(self, threshold, axis)
7388 return self.where(subset, threshold, axis=axis, inplace=inplace)
7390 def clip(
7391 self: FrameOrSeries,
7392 lower=None,
7393 upper=None,
7394 axis=None,
7395 inplace: bool_t = False,
7396 *args,
7397 **kwargs,
7398 ) -> FrameOrSeries:
7399 """
7400 Trim values at input threshold(s).
7402 Assigns values outside boundary to boundary values. Thresholds
7403 can be singular values or array like, and in the latter case
7404 the clipping is performed element-wise in the specified axis.
7406 Parameters
7407 ----------
7408 lower : float or array_like, default None
7409 Minimum threshold value. All values below this
7410 threshold will be set to it.
7411 upper : float or array_like, default None
7412 Maximum threshold value. All values above this
7413 threshold will be set to it.
7414 axis : int or str axis name, optional
7415 Align object with lower and upper along the given axis.
7416 inplace : bool, default False
7417 Whether to perform the operation in place on the data.
7419 .. versionadded:: 0.21.0
7420 *args, **kwargs
7421 Additional keywords have no effect but might be accepted
7422 for compatibility with numpy.
7424 Returns
7425 -------
7426 Series or DataFrame
7427 Same type as calling object with the values outside the
7428 clip boundaries replaced.
7430 Examples
7431 --------
7432 >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
7433 >>> df = pd.DataFrame(data)
7434 >>> df
7435 col_0 col_1
7436 0 9 -2
7437 1 -3 -7
7438 2 0 6
7439 3 -1 8
7440 4 5 -5
7442 Clips per column using lower and upper thresholds:
7444 >>> df.clip(-4, 6)
7445 col_0 col_1
7446 0 6 -2
7447 1 -3 -4
7448 2 0 6
7449 3 -1 6
7450 4 5 -4
7452 Clips using specific lower and upper thresholds per column element:
7454 >>> t = pd.Series([2, -4, -1, 6, 3])
7455 >>> t
7456 0 2
7457 1 -4
7458 2 -1
7459 3 6
7460 4 3
7461 dtype: int64
7463 >>> df.clip(t, t + 4, axis=0)
7464 col_0 col_1
7465 0 6 2
7466 1 -3 -4
7467 2 0 3
7468 3 6 8
7469 4 5 3
7470 """
7471 inplace = validate_bool_kwarg(inplace, "inplace")
7473 axis = nv.validate_clip_with_axis(axis, args, kwargs)
7474 if axis is not None:
7475 axis = self._get_axis_number(axis)
7477 # GH 17276
7478 # numpy doesn't like NaN as a clip value
7479 # so ignore
7480 # GH 19992
7481 # numpy doesn't drop a list-like bound containing NaN
7482 if not is_list_like(lower) and np.any(isna(lower)):
7483 lower = None
7484 if not is_list_like(upper) and np.any(isna(upper)):
7485 upper = None
7487 # GH 2747 (arguments were reversed)
7488 if lower is not None and upper is not None:
7489 if is_scalar(lower) and is_scalar(upper):
7490 lower, upper = min(lower, upper), max(lower, upper)
7492 # fast-path for scalars
7493 if (lower is None or (is_scalar(lower) and is_number(lower))) and (
7494 upper is None or (is_scalar(upper) and is_number(upper))
7495 ):
7496 return self._clip_with_scalar(lower, upper, inplace=inplace)
7498 result = self
7499 if lower is not None:
7500 result = result._clip_with_one_bound(
7501 lower, method=self.ge, axis=axis, inplace=inplace
7502 )
7503 if upper is not None:
7504 if inplace:
7505 result = self
7506 result = result._clip_with_one_bound(
7507 upper, method=self.le, axis=axis, inplace=inplace
7508 )
7510 return result
7512 _shared_docs[
7513 "groupby"
7514 ] = """
7515 Group %(klass)s using a mapper or by a Series of columns.
7517 A groupby operation involves some combination of splitting the
7518 object, applying a function, and combining the results. This can be
7519 used to group large amounts of data and compute operations on these
7520 groups.
7522 Parameters
7523 ----------
7524 by : mapping, function, label, or list of labels
7525 Used to determine the groups for the groupby.
7526 If ``by`` is a function, it's called on each value of the object's
7527 index. If a dict or Series is passed, the Series or dict VALUES
7528 will be used to determine the groups (the Series' values are first
7529 aligned; see ``.align()`` method). If an ndarray is passed, the
7530 values are used as-is to determine the groups. A label or list of
7531 labels may be passed to group by the columns in ``self``. Notice
7532 that a tuple is interpreted as a (single) key.
7533 axis : {0 or 'index', 1 or 'columns'}, default 0
7534 Split along rows (0) or columns (1).
7535 level : int, level name, or sequence of such, default None
7536 If the axis is a MultiIndex (hierarchical), group by a particular
7537 level or levels.
7538 as_index : bool, default True
7539 For aggregated output, return object with group labels as the
7540 index. Only relevant for DataFrame input. as_index=False is
7541 effectively "SQL-style" grouped output.
7542 sort : bool, default True
7543 Sort group keys. Get better performance by turning this off.
7544 Note this does not influence the order of observations within each
7545 group. Groupby preserves the order of rows within each group.
7546 group_keys : bool, default True
7547 When calling apply, add group keys to index to identify pieces.
7548 squeeze : bool, default False
7549 Reduce the dimensionality of the return type if possible,
7550 otherwise return a consistent type.
7551 observed : bool, default False
7552 This only applies if any of the groupers are Categoricals.
7553 If True: only show observed values for categorical groupers.
7554 If False: show all values for categorical groupers.
7556 .. versionadded:: 0.23.0
7558 Returns
7559 -------
7560 %(klass)sGroupBy
7561 Returns a groupby object that contains information about the groups.
7563 See Also
7564 --------
7565 resample : Convenience method for frequency conversion and resampling
7566 of time series.
7568 Notes
7569 -----
7570 See the `user guide
7571 <https://pandas.pydata.org/pandas-docs/stable/groupby.html>`_ for more.
7572 """
7574 def asfreq(
7575 self: FrameOrSeries,
7576 freq,
7577 method=None,
7578 how: Optional[str] = None,
7579 normalize: bool_t = False,
7580 fill_value=None,
7581 ) -> FrameOrSeries:
7582 """
7583 Convert TimeSeries to specified frequency.
7585 Optionally provide filling method to pad/backfill missing values.
7587 Returns the original data conformed to a new index with the specified
7588 frequency. ``resample`` is more appropriate if an operation, such as
7589 summarization, is necessary to represent the data at the new frequency.
7591 Parameters
7592 ----------
7593 freq : DateOffset or str
7594 method : {'backfill'/'bfill', 'pad'/'ffill'}, default None
7595 Method to use for filling holes in reindexed Series (note this
7596 does not fill NaNs that already were present):
7598 * 'pad' / 'ffill': propagate last valid observation forward to next
7599 valid
7600 * 'backfill' / 'bfill': use NEXT valid observation to fill.
7601 how : {'start', 'end'}, default 'end'
7602 For PeriodIndex only (see PeriodIndex.asfreq).
7603 normalize : bool, default False
7604 Whether to reset output index to midnight.
7605 fill_value : scalar, optional
7606 Value to use for missing values, applied during upsampling (note
7607 this does not fill NaNs that already were present).
7609 Returns
7610 -------
7611 converted : same type as caller
7613 See Also
7614 --------
7615 reindex
7617 Notes
7618 -----
7619 To learn more about the frequency strings, please see `this link
7620 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
7622 Examples
7623 --------
7625 Start by creating a series with 4 one minute timestamps.
7627 >>> index = pd.date_range('1/1/2000', periods=4, freq='T')
7628 >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index)
7629 >>> df = pd.DataFrame({'s':series})
7630 >>> df
7631 s
7632 2000-01-01 00:00:00 0.0
7633 2000-01-01 00:01:00 NaN
7634 2000-01-01 00:02:00 2.0
7635 2000-01-01 00:03:00 3.0
7637 Upsample the series into 30 second bins.
7639 >>> df.asfreq(freq='30S')
7640 s
7641 2000-01-01 00:00:00 0.0
7642 2000-01-01 00:00:30 NaN
7643 2000-01-01 00:01:00 NaN
7644 2000-01-01 00:01:30 NaN
7645 2000-01-01 00:02:00 2.0
7646 2000-01-01 00:02:30 NaN
7647 2000-01-01 00:03:00 3.0
7649 Upsample again, providing a ``fill_value``.
7651 >>> df.asfreq(freq='30S', fill_value=9.0)
7652 s
7653 2000-01-01 00:00:00 0.0
7654 2000-01-01 00:00:30 9.0
7655 2000-01-01 00:01:00 NaN
7656 2000-01-01 00:01:30 9.0
7657 2000-01-01 00:02:00 2.0
7658 2000-01-01 00:02:30 9.0
7659 2000-01-01 00:03:00 3.0
7661 Upsample again, providing a ``method``.
7663 >>> df.asfreq(freq='30S', method='bfill')
7664 s
7665 2000-01-01 00:00:00 0.0
7666 2000-01-01 00:00:30 NaN
7667 2000-01-01 00:01:00 NaN
7668 2000-01-01 00:01:30 2.0
7669 2000-01-01 00:02:00 2.0
7670 2000-01-01 00:02:30 3.0
7671 2000-01-01 00:03:00 3.0
7672 """
7673 from pandas.core.resample import asfreq
7675 return asfreq(
7676 self,
7677 freq,
7678 method=method,
7679 how=how,
7680 normalize=normalize,
7681 fill_value=fill_value,
7682 )
7684 def at_time(
7685 self: FrameOrSeries, time, asof: bool_t = False, axis=None
7686 ) -> FrameOrSeries:
7687 """
7688 Select values at particular time of day (e.g. 9:30AM).
7690 Parameters
7691 ----------
7692 time : datetime.time or str
7693 axis : {0 or 'index', 1 or 'columns'}, default 0
7695 .. versionadded:: 0.24.0
7697 Returns
7698 -------
7699 Series or DataFrame
7701 Raises
7702 ------
7703 TypeError
7704 If the index is not a :class:`DatetimeIndex`
7706 See Also
7707 --------
7708 between_time : Select values between particular times of the day.
7709 first : Select initial periods of time series based on a date offset.
7710 last : Select final periods of time series based on a date offset.
7711 DatetimeIndex.indexer_at_time : Get just the index locations for
7712 values at particular time of the day.
7714 Examples
7715 --------
7716 >>> i = pd.date_range('2018-04-09', periods=4, freq='12H')
7717 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
7718 >>> ts
7719 A
7720 2018-04-09 00:00:00 1
7721 2018-04-09 12:00:00 2
7722 2018-04-10 00:00:00 3
7723 2018-04-10 12:00:00 4
7725 >>> ts.at_time('12:00')
7726 A
7727 2018-04-09 12:00:00 2
7728 2018-04-10 12:00:00 4
7729 """
7730 if axis is None:
7731 axis = self._stat_axis_number
7732 axis = self._get_axis_number(axis)
7734 index = self._get_axis(axis)
7735 try:
7736 indexer = index.indexer_at_time(time, asof=asof)
7737 except AttributeError:
7738 raise TypeError("Index must be DatetimeIndex")
7740 return self._take_with_is_copy(indexer, axis=axis)
7742 def between_time(
7743 self: FrameOrSeries,
7744 start_time,
7745 end_time,
7746 include_start: bool_t = True,
7747 include_end: bool_t = True,
7748 axis=None,
7749 ) -> FrameOrSeries:
7750 """
7751 Select values between particular times of the day (e.g., 9:00-9:30 AM).
7753 By setting ``start_time`` to be later than ``end_time``,
7754 you can get the times that are *not* between the two times.
7756 Parameters
7757 ----------
7758 start_time : datetime.time or str
7759 end_time : datetime.time or str
7760 include_start : bool, default True
7761 include_end : bool, default True
7762 axis : {0 or 'index', 1 or 'columns'}, default 0
7764 .. versionadded:: 0.24.0
7766 Returns
7767 -------
7768 Series or DataFrame
7770 Raises
7771 ------
7772 TypeError
7773 If the index is not a :class:`DatetimeIndex`
7775 See Also
7776 --------
7777 at_time : Select values at a particular time of the day.
7778 first : Select initial periods of time series based on a date offset.
7779 last : Select final periods of time series based on a date offset.
7780 DatetimeIndex.indexer_between_time : Get just the index locations for
7781 values between particular times of the day.
7783 Examples
7784 --------
7785 >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
7786 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
7787 >>> ts
7788 A
7789 2018-04-09 00:00:00 1
7790 2018-04-10 00:20:00 2
7791 2018-04-11 00:40:00 3
7792 2018-04-12 01:00:00 4
7794 >>> ts.between_time('0:15', '0:45')
7795 A
7796 2018-04-10 00:20:00 2
7797 2018-04-11 00:40:00 3
7799 You get the times that are *not* between two times by setting
7800 ``start_time`` later than ``end_time``:
7802 >>> ts.between_time('0:45', '0:15')
7803 A
7804 2018-04-09 00:00:00 1
7805 2018-04-12 01:00:00 4
7806 """
7807 if axis is None:
7808 axis = self._stat_axis_number
7809 axis = self._get_axis_number(axis)
7811 index = self._get_axis(axis)
7812 try:
7813 indexer = index.indexer_between_time(
7814 start_time,
7815 end_time,
7816 include_start=include_start,
7817 include_end=include_end,
7818 )
7819 except AttributeError:
7820 raise TypeError("Index must be DatetimeIndex")
7822 return self._take_with_is_copy(indexer, axis=axis)
7824 def resample(
7825 self,
7826 rule,
7827 axis=0,
7828 closed: Optional[str] = None,
7829 label: Optional[str] = None,
7830 convention: str = "start",
7831 kind: Optional[str] = None,
7832 loffset=None,
7833 base: int = 0,
7834 on=None,
7835 level=None,
7836 ):
7837 """
7838 Resample time-series data.
7840 Convenience method for frequency conversion and resampling of time
7841 series. Object must have a datetime-like index (`DatetimeIndex`,
7842 `PeriodIndex`, or `TimedeltaIndex`), or pass datetime-like values
7843 to the `on` or `level` keyword.
7845 Parameters
7846 ----------
7847 rule : DateOffset, Timedelta or str
7848 The offset string or object representing target conversion.
7849 axis : {0 or 'index', 1 or 'columns'}, default 0
7850 Which axis to use for up- or down-sampling. For `Series` this
7851 will default to 0, i.e. along the rows. Must be
7852 `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`.
7853 closed : {'right', 'left'}, default None
7854 Which side of bin interval is closed. The default is 'left'
7855 for all frequency offsets except for 'M', 'A', 'Q', 'BM',
7856 'BA', 'BQ', and 'W' which all have a default of 'right'.
7857 label : {'right', 'left'}, default None
7858 Which bin edge label to label bucket with. The default is 'left'
7859 for all frequency offsets except for 'M', 'A', 'Q', 'BM',
7860 'BA', 'BQ', and 'W' which all have a default of 'right'.
7861 convention : {'start', 'end', 's', 'e'}, default 'start'
7862 For `PeriodIndex` only, controls whether to use the start or
7863 end of `rule`.
7864 kind : {'timestamp', 'period'}, optional, default None
7865 Pass 'timestamp' to convert the resulting index to a
7866 `DateTimeIndex` or 'period' to convert it to a `PeriodIndex`.
7867 By default the input representation is retained.
7868 loffset : timedelta, default None
7869 Adjust the resampled time labels.
7870 base : int, default 0
7871 For frequencies that evenly subdivide 1 day, the "origin" of the
7872 aggregated intervals. For example, for '5min' frequency, base could
7873 range from 0 through 4. Defaults to 0.
7874 on : str, optional
7875 For a DataFrame, column to use instead of index for resampling.
7876 Column must be datetime-like.
7878 level : str or int, optional
7879 For a MultiIndex, level (name or number) to use for
7880 resampling. `level` must be datetime-like.
7882 Returns
7883 -------
7884 Resampler object
7886 See Also
7887 --------
7888 groupby : Group by mapping, function, label, or list of labels.
7889 Series.resample : Resample a Series.
7890 DataFrame.resample: Resample a DataFrame.
7892 Notes
7893 -----
7894 See the `user guide
7895 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>`_
7896 for more.
7898 To learn more about the offset strings, please see `this link
7899 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`__.
7901 Examples
7902 --------
7904 Start by creating a series with 9 one-minute timestamps.
7906 >>> index = pd.date_range('1/1/2000', periods=9, freq='T')
7907 >>> series = pd.Series(range(9), index=index)
7908 >>> series
7909 2000-01-01 00:00:00 0
7910 2000-01-01 00:01:00 1
7911 2000-01-01 00:02:00 2
7912 2000-01-01 00:03:00 3
7913 2000-01-01 00:04:00 4
7914 2000-01-01 00:05:00 5
7915 2000-01-01 00:06:00 6
7916 2000-01-01 00:07:00 7
7917 2000-01-01 00:08:00 8
7918 Freq: T, dtype: int64
7920 Downsample the series into 3 minute bins and sum the values
7921 of the timestamps falling into a bin.
7923 >>> series.resample('3T').sum()
7924 2000-01-01 00:00:00 3
7925 2000-01-01 00:03:00 12
7926 2000-01-01 00:06:00 21
7927 Freq: 3T, dtype: int64
7929 Downsample the series into 3 minute bins as above, but label each
7930 bin using the right edge instead of the left. Please note that the
7931 value at the timestamp used as the label is not included in the bucket
7932 it labels. For example, in the original series the
7933 bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed
7934 value in the resampled bucket with the label ``2000-01-01 00:03:00``
7935 does not include 3 (if it did, the summed value would be 6, not 3).
7936 To include this value, close the right side of the bin interval, as
7937 illustrated in the example below this one.
7939 >>> series.resample('3T', label='right').sum()
7940 2000-01-01 00:03:00 3
7941 2000-01-01 00:06:00 12
7942 2000-01-01 00:09:00 21
7943 Freq: 3T, dtype: int64
7945 Downsample the series into 3 minute bins as above, but close the right
7946 side of the bin interval.
7948 >>> series.resample('3T', label='right', closed='right').sum()
7949 2000-01-01 00:00:00 0
7950 2000-01-01 00:03:00 6
7951 2000-01-01 00:06:00 15
7952 2000-01-01 00:09:00 15
7953 Freq: 3T, dtype: int64
7955 Upsample the series into 30 second bins.
7957 >>> series.resample('30S').asfreq()[0:5] # Select first 5 rows
7958 2000-01-01 00:00:00 0.0
7959 2000-01-01 00:00:30 NaN
7960 2000-01-01 00:01:00 1.0
7961 2000-01-01 00:01:30 NaN
7962 2000-01-01 00:02:00 2.0
7963 Freq: 30S, dtype: float64
7965 Upsample the series into 30 second bins and fill the ``NaN``
7966 values using the ``pad`` method.
7968 >>> series.resample('30S').pad()[0:5]
7969 2000-01-01 00:00:00 0
7970 2000-01-01 00:00:30 0
7971 2000-01-01 00:01:00 1
7972 2000-01-01 00:01:30 1
7973 2000-01-01 00:02:00 2
7974 Freq: 30S, dtype: int64
7976 Upsample the series into 30 second bins and fill the
7977 ``NaN`` values using the ``bfill`` method.
7979 >>> series.resample('30S').bfill()[0:5]
7980 2000-01-01 00:00:00 0
7981 2000-01-01 00:00:30 1
7982 2000-01-01 00:01:00 1
7983 2000-01-01 00:01:30 2
7984 2000-01-01 00:02:00 2
7985 Freq: 30S, dtype: int64
7987 Pass a custom function via ``apply``
7989 >>> def custom_resampler(array_like):
7990 ... return np.sum(array_like) + 5
7991 ...
7992 >>> series.resample('3T').apply(custom_resampler)
7993 2000-01-01 00:00:00 8
7994 2000-01-01 00:03:00 17
7995 2000-01-01 00:06:00 26
7996 Freq: 3T, dtype: int64
7998 For a Series with a PeriodIndex, the keyword `convention` can be
7999 used to control whether to use the start or end of `rule`.
8001 Resample a year by quarter using 'start' `convention`. Values are
8002 assigned to the first quarter of the period.
8004 >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01',
8005 ... freq='A',
8006 ... periods=2))
8007 >>> s
8008 2012 1
8009 2013 2
8010 Freq: A-DEC, dtype: int64
8011 >>> s.resample('Q', convention='start').asfreq()
8012 2012Q1 1.0
8013 2012Q2 NaN
8014 2012Q3 NaN
8015 2012Q4 NaN
8016 2013Q1 2.0
8017 2013Q2 NaN
8018 2013Q3 NaN
8019 2013Q4 NaN
8020 Freq: Q-DEC, dtype: float64
8022 Resample quarters by month using 'end' `convention`. Values are
8023 assigned to the last month of the period.
8025 >>> q = pd.Series([1, 2, 3, 4], index=pd.period_range('2018-01-01',
8026 ... freq='Q',
8027 ... periods=4))
8028 >>> q
8029 2018Q1 1
8030 2018Q2 2
8031 2018Q3 3
8032 2018Q4 4
8033 Freq: Q-DEC, dtype: int64
8034 >>> q.resample('M', convention='end').asfreq()
8035 2018-03 1.0
8036 2018-04 NaN
8037 2018-05 NaN
8038 2018-06 2.0
8039 2018-07 NaN
8040 2018-08 NaN
8041 2018-09 3.0
8042 2018-10 NaN
8043 2018-11 NaN
8044 2018-12 4.0
8045 Freq: M, dtype: float64
8047 For DataFrame objects, the keyword `on` can be used to specify the
8048 column instead of the index for resampling.
8050 >>> d = {'price': [10, 11, 9, 13, 14, 18, 17, 19],
8051 ...      'volume': [50, 60, 40, 100, 50, 100, 40, 50]}
8052 >>> df = pd.DataFrame(d)
8053 >>> df['week_starting'] = pd.date_range('01/01/2018',
8054 ... periods=8,
8055 ... freq='W')
8056 >>> df
8057 price volume week_starting
8058 0 10 50 2018-01-07
8059 1 11 60 2018-01-14
8060 2 9 40 2018-01-21
8061 3 13 100 2018-01-28
8062 4 14 50 2018-02-04
8063 5 18 100 2018-02-11
8064 6 17 40 2018-02-18
8065 7 19 50 2018-02-25
8066 >>> df.resample('M', on='week_starting').mean()
8067 price volume
8068 week_starting
8069 2018-01-31 10.75 62.5
8070 2018-02-28 17.00 60.0
8072 For a DataFrame with MultiIndex, the keyword `level` can be used to
8073 specify on which level the resampling needs to take place.
8075 >>> days = pd.date_range('1/1/2000', periods=4, freq='D')
8076 >>> d2 = {'price': [10, 11, 9, 13, 14, 18, 17, 19],
8077 ...       'volume': [50, 60, 40, 100, 50, 100, 40, 50]}
8078 >>> df2 = pd.DataFrame(d2,
8079 ... index=pd.MultiIndex.from_product([days,
8080 ... ['morning',
8081 ... 'afternoon']]
8082 ... ))
8083 >>> df2
8084 price volume
8085 2000-01-01 morning 10 50
8086 afternoon 11 60
8087 2000-01-02 morning 9 40
8088 afternoon 13 100
8089 2000-01-03 morning 14 50
8090 afternoon 18 100
8091 2000-01-04 morning 17 40
8092 afternoon 19 50
8093 >>> df2.resample('D', level=0).sum()
8094 price volume
8095 2000-01-01 21 110
8096 2000-01-02 22 140
8097 2000-01-03 32 150
8098 2000-01-04 36 90
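If the frequency evenly subdivides a day, `base` moves the bin origin;
a small sketch with the minute series from above (the 3-minute bins now
start at 00:02 instead of 00:00):
>>> series.resample('3T', base=2).sum()
1999-12-31 23:59:00     1
2000-01-01 00:02:00     9
2000-01-01 00:05:00    18
2000-01-01 00:08:00     8
Freq: 3T, dtype: int64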
8099 """
8101 from pandas.core.resample import resample
8103 axis = self._get_axis_number(axis)
8104 return resample(
8105 self,
8106 freq=rule,
8107 label=label,
8108 closed=closed,
8109 axis=axis,
8110 kind=kind,
8111 loffset=loffset,
8112 convention=convention,
8113 base=base,
8114 key=on,
8115 level=level,
8116 )
8118 def first(self: FrameOrSeries, offset) -> FrameOrSeries:
8119 """
8120 Select initial periods of time series data based on a date offset.
8122 Parameters
8123 ----------
8124 offset : str, DateOffset, dateutil.relativedelta
8126 Returns
8127 -------
8128 subset : same type as caller
8130 Raises
8131 ------
8132 TypeError
8133 If the index is not a :class:`DatetimeIndex`
8135 See Also
8136 --------
8137 last : Select final periods of time series based on a date offset.
8138 at_time : Select values at a particular time of the day.
8139 between_time : Select values between particular times of the day.
8141 Examples
8142 --------
8143 >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
8144 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
8145 >>> ts
8146 A
8147 2018-04-09 1
8148 2018-04-11 2
8149 2018-04-13 3
8150 2018-04-15 4
8152 Get the rows for the first 3 days:
8154 >>> ts.first('3D')
8155 A
8156 2018-04-09 1
8157 2018-04-11 2
8159 Notice that the data for the first 3 calendar days was returned, not
8160 the first 3 observed days in the dataset, and therefore data for
8161 2018-04-13 was not returned.
8162 """
8163 if not isinstance(self.index, DatetimeIndex):
8164 raise TypeError("'first' only supports a DatetimeIndex index")
8166 if len(self.index) == 0:
8167 return self
8169 offset = to_offset(offset)
8170 end_date = end = self.index[0] + offset
8172 # Tick-like, e.g. 3 weeks
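# Note: for a pure time-span (Tick) offset, the computed end boundary is
# treated as exclusive when it lands exactly on an index label; the
# searchsorted(side="left") below stops just before that label, so e.g.
# first('1D') on a daily index returns only the first row.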
8173 if not offset.is_anchored() and hasattr(offset, "_inc"):
8174 if end_date in self.index:
8175 end = self.index.searchsorted(end_date, side="left")
8176 return self.iloc[:end]
8178 return self.loc[:end]
8180 def last(self: FrameOrSeries, offset) -> FrameOrSeries:
8181 """
8182 Select final periods of time series data based on a date offset.
8184 Parameters
8185 ----------
8186 offset : str, DateOffset, dateutil.relativedelta
8188 Returns
8189 -------
8190 subset : same type as caller
8192 Raises
8193 ------
8194 TypeError
8195 If the index is not a :class:`DatetimeIndex`
8197 See Also
8198 --------
8199 first : Select initial periods of time series based on a date offset.
8200 at_time : Select values at a particular time of the day.
8201 between_time : Select values between particular times of the day.
8203 Examples
8204 --------
8205 >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
8206 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
8207 >>> ts
8208 A
8209 2018-04-09 1
8210 2018-04-11 2
8211 2018-04-13 3
8212 2018-04-15 4
8214 Get the rows for the last 3 days:
8216 >>> ts.last('3D')
8217 A
8218 2018-04-13 3
8219 2018-04-15 4
8221 Notice that the data for the last 3 calendar days was returned, not
8222 the last 3 observed days in the dataset, and therefore data for
8223 2018-04-11 was not returned.
8224 """
8225 if not isinstance(self.index, DatetimeIndex):
8226 raise TypeError("'last' only supports a DatetimeIndex index")
8228 if len(self.index) == 0:
8229 return self
8231 offset = to_offset(offset)
8233 start_date = self.index[-1] - offset
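# searchsorted(side="right") makes the start boundary exclusive when it
# falls exactly on an index label, mirroring the end-boundary handling
# in first(); only rows strictly after index[-1] - offset are kept then.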
8234 start = self.index.searchsorted(start_date, side="right")
8235 return self.iloc[start:]
8237 def rank(
8238 self: FrameOrSeries,
8239 axis=0,
8240 method: str = "average",
8241 numeric_only: Optional[bool_t] = None,
8242 na_option: str = "keep",
8243 ascending: bool_t = True,
8244 pct: bool_t = False,
8245 ) -> FrameOrSeries:
8246 """
8247 Compute numerical data ranks (1 through n) along axis.
8249 By default, equal values are assigned a rank that is the average of the
8250 ranks of those values.
8252 Parameters
8253 ----------
8254 axis : {0 or 'index', 1 or 'columns'}, default 0
8255 Index to direct ranking.
8256 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
8257 How to rank the group of records that have the same value (i.e. ties):
8259 * average: average rank of the group
8260 * min: lowest rank in the group
8261 * max: highest rank in the group
8262 * first: ranks assigned in order they appear in the array
8263 * dense: like 'min', but rank always increases by 1 between groups.
8265 numeric_only : bool, optional
8266 For DataFrame objects, rank only numeric columns if set to True.
8267 na_option : {'keep', 'top', 'bottom'}, default 'keep'
8268 How to rank NaN values:
8270 * keep: assign NaN rank to NaN values
8271 * top: assign smallest rank to NaN values if ascending
8272 * bottom: assign highest rank to NaN values if ascending.
8274 ascending : bool, default True
8275 Whether or not the elements should be ranked in ascending order.
8276 pct : bool, default False
8277 Whether or not to display the returned rankings in percentile
8278 form.
8280 Returns
8281 -------
8282 same type as caller
8283 Return a Series or DataFrame with data ranks as values.
8285 See Also
8286 --------
8287 core.groupby.GroupBy.rank : Rank of values within each group.
8289 Examples
8290 --------
8292 >>> df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog',
8293 ... 'spider', 'snake'],
8294 ... 'Number_legs': [4, 2, 4, 8, np.nan]})
8295 >>> df
8296 Animal Number_legs
8297 0 cat 4.0
8298 1 penguin 2.0
8299 2 dog 4.0
8300 3 spider 8.0
8301 4 snake NaN
8303 The following example shows how the method behaves with the above
8304 parameters:
8306 * default_rank: this is the default behaviour obtained without using
8307 any parameter.
8308 * max_rank: setting ``method = 'max'``, the records that have the
8309 same values are ranked using the highest rank (e.g. since 'cat'
8310 and 'dog' are both in the 2nd and 3rd position, rank 3 is assigned).
8311 * NA_bottom: choosing ``na_option = 'bottom'``, if there are records
8312 with NaN values they are placed at the bottom of the ranking.
8313 * pct_rank: when setting ``pct = True``, the ranking is expressed as
8314 percentile rank.
8316 >>> df['default_rank'] = df['Number_legs'].rank()
8317 >>> df['max_rank'] = df['Number_legs'].rank(method='max')
8318 >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom')
8319 >>> df['pct_rank'] = df['Number_legs'].rank(pct=True)
8320 >>> df
8321 Animal Number_legs default_rank max_rank NA_bottom pct_rank
8322 0 cat 4.0 2.5 3.0 2.5 0.625
8323 1 penguin 2.0 1.0 1.0 1.0 0.250
8324 2 dog 4.0 2.5 3.0 2.5 0.625
8325 3 spider 8.0 4.0 4.0 4.0 1.000
8326 4 snake NaN NaN NaN 5.0 NaN
8327 """
8328 axis = self._get_axis_number(axis)
8330 if na_option not in {"keep", "top", "bottom"}:
8331 msg = "na_option must be one of 'keep', 'top', or 'bottom'"
8332 raise ValueError(msg)
8334 def ranker(data):
8335 ranks = algos.rank(
8336 data.values,
8337 axis=axis,
8338 method=method,
8339 ascending=ascending,
8340 na_option=na_option,
8341 pct=pct,
8342 )
8343 ranks = self._constructor(ranks, **data._construct_axes_dict())
8344 return ranks.__finalize__(self)
8346 # if numeric_only is None, and we can't get anything, we try with
8347 # numeric_only=True
8348 if numeric_only is None:
8349 try:
8350 return ranker(self)
8351 except TypeError:
8352 numeric_only = True
8354 if numeric_only:
8355 data = self._get_numeric_data()
8356 else:
8357 data = self
8359 return ranker(data)
8361 _shared_docs[
8362 "align"
8363 ] = """
8364 Align two objects on their axes with the specified join method.
8366 Join method is specified for each axis Index.
8368 Parameters
8369 ----------
8370 other : DataFrame or Series
8371 join : {'outer', 'inner', 'left', 'right'}, default 'outer'
8372 axis : allowed axis of the other object, default None
8373 Align on index (0), columns (1), or both (None).
8374 level : int or level name, default None
8375 Broadcast across a level, matching Index values on the
8376 passed MultiIndex level.
8377 copy : bool, default True
8378 Always return new objects. If copy=False and no reindexing is
8379 required, the original objects are returned.
8380 fill_value : scalar, default np.NaN
8381 Value to use for missing values. Defaults to NaN, but can be any
8382 "compatible" value.
8383 method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
8384 Method to use for filling holes in reindexed Series:
8386 - pad / ffill: propagate last valid observation forward to next valid.
8387 - backfill / bfill: use NEXT valid observation to fill gap.
8389 limit : int, default None
8390 If method is specified, this is the maximum number of consecutive
8391 NaN values to forward/backward fill. In other words, if there is
8392 a gap with more than this number of consecutive NaNs, it will only
8393 be partially filled. If method is not specified, this is the
8394 maximum number of entries along the entire axis where NaNs will be
8395 filled. Must be greater than 0 if not None.
8396 fill_axis : %(axes_single_arg)s, default 0
8397 Filling axis, method and limit.
8398 broadcast_axis : %(axes_single_arg)s, default None
8399 Broadcast values along this axis, if aligning two objects of
8400 different dimensions.
8402 Returns
8403 -------
8404 (left, right) : (%(klass)s, type of other)
8405 Aligned objects.
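Examples
--------
A minimal sketch of the default outer join on two Series: the indexes
are unioned and newly introduced positions are filled with NaN (output
shown for default settings).
>>> s1 = pd.Series([1, 2], index=['a', 'b'])
>>> s2 = pd.Series([3, 4], index=['b', 'c'])
>>> left, right = s1.align(s2, join='outer')
>>> left
a    1.0
b    2.0
c    NaN
dtype: float64
>>> right
a    NaN
b    3.0
c    4.0
dtype: float64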
8406 """
8408 @Appender(_shared_docs["align"] % _shared_doc_kwargs)
8409 def align(
8410 self,
8411 other,
8412 join="outer",
8413 axis=None,
8414 level=None,
8415 copy=True,
8416 fill_value=None,
8417 method=None,
8418 limit=None,
8419 fill_axis=0,
8420 broadcast_axis=None,
8421 ):
8422 method = missing.clean_fill_method(method)
8424 if broadcast_axis == 1 and self.ndim != other.ndim:
8425 if isinstance(self, ABCSeries):
8426 # this means other is a DataFrame, and we need to broadcast
8427 # self
8428 cons = self._constructor_expanddim
8429 df = cons(
8430 {c: self for c in other.columns}, **other._construct_axes_dict()
8431 )
8432 return df._align_frame(
8433 other,
8434 join=join,
8435 axis=axis,
8436 level=level,
8437 copy=copy,
8438 fill_value=fill_value,
8439 method=method,
8440 limit=limit,
8441 fill_axis=fill_axis,
8442 )
8443 elif isinstance(other, ABCSeries):
8444 # this means self is a DataFrame, and we need to broadcast
8445 # other
8446 cons = other._constructor_expanddim
8447 df = cons(
8448 {c: other for c in self.columns}, **self._construct_axes_dict()
8449 )
8450 return self._align_frame(
8451 df,
8452 join=join,
8453 axis=axis,
8454 level=level,
8455 copy=copy,
8456 fill_value=fill_value,
8457 method=method,
8458 limit=limit,
8459 fill_axis=fill_axis,
8460 )
8462 if axis is not None:
8463 axis = self._get_axis_number(axis)
8464 if isinstance(other, ABCDataFrame):
8465 return self._align_frame(
8466 other,
8467 join=join,
8468 axis=axis,
8469 level=level,
8470 copy=copy,
8471 fill_value=fill_value,
8472 method=method,
8473 limit=limit,
8474 fill_axis=fill_axis,
8475 )
8476 elif isinstance(other, ABCSeries):
8477 return self._align_series(
8478 other,
8479 join=join,
8480 axis=axis,
8481 level=level,
8482 copy=copy,
8483 fill_value=fill_value,
8484 method=method,
8485 limit=limit,
8486 fill_axis=fill_axis,
8487 )
8488 else: # pragma: no cover
8489 raise TypeError(f"unsupported type: {type(other)}")
8491 def _align_frame(
8492 self,
8493 other,
8494 join="outer",
8495 axis=None,
8496 level=None,
8497 copy: bool_t = True,
8498 fill_value=None,
8499 method=None,
8500 limit=None,
8501 fill_axis=0,
8502 ):
8503 # defaults
8504 join_index, join_columns = None, None
8505 ilidx, iridx = None, None
8506 clidx, cridx = None, None
8508 is_series = isinstance(self, ABCSeries)
8510 if axis is None or axis == 0:
8511 if not self.index.equals(other.index):
8512 join_index, ilidx, iridx = self.index.join(
8513 other.index, how=join, level=level, return_indexers=True
8514 )
8516 if axis is None or axis == 1:
8517 if not is_series and not self.columns.equals(other.columns):
8518 join_columns, clidx, cridx = self.columns.join(
8519 other.columns, how=join, level=level, return_indexers=True
8520 )
8522 if is_series:
8523 reindexers = {0: [join_index, ilidx]}
8524 else:
8525 reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]}
8527 left = self._reindex_with_indexers(
8528 reindexers, copy=copy, fill_value=fill_value, allow_dups=True
8529 )
8530 # other must always be a DataFrame
8531 right = other._reindex_with_indexers(
8532 {0: [join_index, iridx], 1: [join_columns, cridx]},
8533 copy=copy,
8534 fill_value=fill_value,
8535 allow_dups=True,
8536 )
8538 if method is not None:
8539 _left = left.fillna(method=method, axis=fill_axis, limit=limit)
8540 assert _left is not None # needed for mypy
8541 left = _left
8542 right = right.fillna(method=method, axis=fill_axis, limit=limit)
8544 # if DatetimeIndex have different tz, convert to UTC
8545 if is_datetime64tz_dtype(left.index):
8546 if left.index.tz != right.index.tz:
8547 if join_index is not None:
8548 left.index = join_index
8549 right.index = join_index
8551 return left.__finalize__(self), right.__finalize__(other)
8553 def _align_series(
8554 self,
8555 other,
8556 join="outer",
8557 axis=None,
8558 level=None,
8559 copy: bool_t = True,
8560 fill_value=None,
8561 method=None,
8562 limit=None,
8563 fill_axis=0,
8564 ):
8566 is_series = isinstance(self, ABCSeries)
8568 # series/series compat, other must always be a Series
8569 if is_series:
8570 if axis:
8571 raise ValueError("cannot align series to a series other than axis 0")
8573 # equal
8574 if self.index.equals(other.index):
8575 join_index, lidx, ridx = None, None, None
8576 else:
8577 join_index, lidx, ridx = self.index.join(
8578 other.index, how=join, level=level, return_indexers=True
8579 )
8581 left = self._reindex_indexer(join_index, lidx, copy)
8582 right = other._reindex_indexer(join_index, ridx, copy)
8584 else:
8585 # one has > 1 ndim
8586 fdata = self._data
8587 if axis == 0:
8588 join_index = self.index
8589 lidx, ridx = None, None
8590 if not self.index.equals(other.index):
8591 join_index, lidx, ridx = self.index.join(
8592 other.index, how=join, level=level, return_indexers=True
8593 )
8595 if lidx is not None:
8596 fdata = fdata.reindex_indexer(join_index, lidx, axis=1)
8598 elif axis == 1:
8599 join_index = self.columns
8600 lidx, ridx = None, None
8601 if not self.columns.equals(other.index):
8602 join_index, lidx, ridx = self.columns.join(
8603 other.index, how=join, level=level, return_indexers=True
8604 )
8606 if lidx is not None:
8607 fdata = fdata.reindex_indexer(join_index, lidx, axis=0)
8608 else:
8609 raise ValueError("Must specify axis=0 or 1")
8611 if copy and fdata is self._data:
8612 fdata = fdata.copy()
8614 left = self._constructor(fdata)
8616 if ridx is None:
8617 right = other
8618 else:
8619 right = other.reindex(join_index, level=level)
8621 # fill
8622 fill_na = notna(fill_value) or (method is not None)
8623 if fill_na:
8624 left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis)
8625 right = right.fillna(fill_value, method=method, limit=limit)
8627 # if DatetimeIndex have different tz, convert to UTC
8628 if is_series or (not is_series and axis == 0):
8629 if is_datetime64tz_dtype(left.index):
8630 if left.index.tz != right.index.tz:
8631 if join_index is not None:
8632 left.index = join_index
8633 right.index = join_index
8635 return left.__finalize__(self), right.__finalize__(other)
8637 def _where(
8638 self,
8639 cond,
8640 other=np.nan,
8641 inplace=False,
8642 axis=None,
8643 level=None,
8644 errors="raise",
8645 try_cast=False,
8646 ):
8647 """
8648 Equivalent to public method `where`, except that `other` is not
8649 applied as a function even if callable. Used in __setitem__.
8650 """
8651 inplace = validate_bool_kwarg(inplace, "inplace")
8653 # align the cond to same shape as myself
8654 cond = com.apply_if_callable(cond, self)
8655 if isinstance(cond, NDFrame):
8656 cond, _ = cond.align(self, join="right", broadcast_axis=1)
8657 else:
8658 if not hasattr(cond, "shape"):
8659 cond = np.asanyarray(cond)
8660 if cond.shape != self.shape:
8661 raise ValueError("Array conditional must be same shape as self")
8662 cond = self._constructor(cond, **self._construct_axes_dict())
8664 # make sure we are boolean
8665 fill_value = bool(inplace)
8666 cond = cond.fillna(fill_value)
8668 msg = "Boolean array expected for the condition, not {dtype}"
8670 if not isinstance(cond, ABCDataFrame):
8671 # This is a single-dimensional object.
8672 if not is_bool_dtype(cond):
8673 raise ValueError(msg.format(dtype=cond.dtype))
8674 elif not cond.empty:
8675 for dt in cond.dtypes:
8676 if not is_bool_dtype(dt):
8677 raise ValueError(msg.format(dtype=dt))
8679 cond = -cond if inplace else cond
8681 # try to align with other
8682 try_quick = True
8683 if hasattr(other, "align"):
8685 # align with me
8686 if other.ndim <= self.ndim:
8688 _, other = self.align(
8689 other, join="left", axis=axis, level=level, fill_value=np.nan
8690 )
8692 # if we are NOT aligned, raise, as we cannot do a `where` with a misaligned index
8693 if axis is None and not all(
8694 other._get_axis(i).equals(ax) for i, ax in enumerate(self.axes)
8695 ):
8696 raise InvalidIndexError
8698 # slice me out of the other
8699 else:
8700 raise NotImplementedError(
8701 "cannot align with a higher dimensional NDFrame"
8702 )
8704 if isinstance(other, np.ndarray):
8706 if other.shape != self.shape:
8708 if self.ndim == 1:
8710 icond = cond.values
8712 # GH 2745 / GH 4192
8713 # treat like a scalar
8714 if len(other) == 1:
8715 other = np.array(other[0])
8717 # GH 3235
8718 # match True cond to other
8719 elif len(cond[icond]) == len(other):
8721 # try to not change dtype at first (if try_quick)
8722 if try_quick:
8723 new_other = com.values_from_object(self)
8724 new_other = new_other.copy()
8725 new_other[icond] = other
8726 other = new_other
8728 else:
8729 raise ValueError(
8730 "Length of replacements must equal series length"
8731 )
8733 else:
8734 raise ValueError(
8735 "other must be the same shape as self when an ndarray"
8736 )
8738 # we are the same shape, so create an actual object for alignment
8739 else:
8740 other = self._constructor(other, **self._construct_axes_dict())
8742 if axis is None:
8743 axis = 0
8745 if self.ndim == getattr(other, "ndim", 0):
8746 align = True
8747 else:
8748 align = self._get_axis_number(axis) == 1
8750 block_axis = self._get_block_manager_axis(axis)
8752 if inplace:
8753 # we may have different type blocks come out of putmask, so
8754 # reconstruct the block manager
8756 self._check_inplace_setting(other)
8757 new_data = self._data.putmask(
8758 mask=cond,
8759 new=other,
8760 align=align,
8761 inplace=True,
8762 axis=block_axis,
8763 transpose=self._AXIS_REVERSED,
8764 )
8765 self._update_inplace(new_data)
8767 else:
8768 new_data = self._data.where(
8769 other=other,
8770 cond=cond,
8771 align=align,
8772 errors=errors,
8773 try_cast=try_cast,
8774 axis=block_axis,
8775 )
8777 return self._constructor(new_data).__finalize__(self)
8779 _shared_docs[
8780 "where"
8781 ] = """
8782 Replace values where the condition is %(cond_rev)s.
8784 Parameters
8785 ----------
8786 cond : bool %(klass)s, array-like, or callable
8787 Where `cond` is %(cond)s, keep the original value. Where
8788 %(cond_rev)s, replace with corresponding value from `other`.
8789 If `cond` is callable, it is computed on the %(klass)s and
8790 should return boolean %(klass)s or array. The callable must
8791 not change input %(klass)s (though pandas doesn't check it).
8792 other : scalar, %(klass)s, or callable
8793 Entries where `cond` is %(cond_rev)s are replaced with
8794 corresponding value from `other`.
8795 If other is callable, it is computed on the %(klass)s and
8796 should return scalar or %(klass)s. The callable must not
8797 change input %(klass)s (though pandas doesn't check it).
8798 inplace : bool, default False
8799 Whether to perform the operation in place on the data.
8800 axis : int, default None
8801 Alignment axis if needed.
8802 level : int, default None
8803 Alignment level if needed.
8804 errors : str, {'raise', 'ignore'}, default 'raise'
8805 Note that currently this parameter won't affect
8806 the results and will always coerce to a suitable dtype.
8808 - 'raise' : allow exceptions to be raised.
8809 - 'ignore' : suppress exceptions. On error return original object.
8811 try_cast : bool, default False
8812 Try to cast the result back to the input type (if possible).
8814 Returns
8815 -------
8816 Same type as caller
8818 See Also
8819 --------
8820 :func:`DataFrame.%(name_other)s` : Return an object of same shape as
8821 self.
8823 Notes
8824 -----
8825 The %(name)s method is an application of the if-then idiom. For each
8826 element in the calling DataFrame, if ``cond`` is ``%(cond)s`` the
8827 element is used; otherwise the corresponding element from the DataFrame
8828 ``other`` is used.
8830 The signature for :func:`DataFrame.where` differs from
8831 :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to
8832 ``np.where(m, df1, df2)``.
8834 For further details and examples see the ``%(name)s`` documentation in
8835 :ref:`indexing <indexing.where_mask>`.
8837 Examples
8838 --------
8839 >>> s = pd.Series(range(5))
8840 >>> s.where(s > 0)
8841 0 NaN
8842 1 1.0
8843 2 2.0
8844 3 3.0
8845 4 4.0
8846 dtype: float64
8848 >>> s.mask(s > 0)
8849 0 0.0
8850 1 NaN
8851 2 NaN
8852 3 NaN
8853 4 NaN
8854 dtype: float64
8856 >>> s.where(s > 1, 10)
8857 0 10
8858 1 10
8859 2 2
8860 3 3
8861 4 4
8862 dtype: int64
8864 >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
8865 >>> df
8866 A B
8867 0 0 1
8868 1 2 3
8869 2 4 5
8870 3 6 7
8871 4 8 9
8872 >>> m = df %% 3 == 0
8873 >>> df.where(m, -df)
8874 A B
8875 0 0 -1
8876 1 -2 3
8877 2 -4 -5
8878 3 6 -7
8879 4 -8 9
8880 >>> df.where(m, -df) == np.where(m, df, -df)
8881 A B
8882 0 True True
8883 1 True True
8884 2 True True
8885 3 True True
8886 4 True True
8887 >>> df.where(m, -df) == df.mask(~m, -df)
8888 A B
8889 0 True True
8890 1 True True
8891 2 True True
8892 3 True True
8893 4 True True
8894 """
8896 @Appender(
8897 _shared_docs["where"]
8898 % dict(
8899 _shared_doc_kwargs,
8900 cond="True",
8901 cond_rev="False",
8902 name="where",
8903 name_other="mask",
8904 )
8905 )
8906 def where(
8907 self,
8908 cond,
8909 other=np.nan,
8910 inplace=False,
8911 axis=None,
8912 level=None,
8913 errors="raise",
8914 try_cast=False,
8915 ):
8917 other = com.apply_if_callable(other, self)
8918 return self._where(
8919 cond, other, inplace, axis, level, errors=errors, try_cast=try_cast
8920 )
8922 @Appender(
8923 _shared_docs["where"]
8924 % dict(
8925 _shared_doc_kwargs,
8926 cond="False",
8927 cond_rev="True",
8928 name="mask",
8929 name_other="where",
8930 )
8931 )
8932 def mask(
8933 self,
8934 cond,
8935 other=np.nan,
8936 inplace=False,
8937 axis=None,
8938 level=None,
8939 errors="raise",
8940 try_cast=False,
8941 ):
8943 inplace = validate_bool_kwarg(inplace, "inplace")
8944 cond = com.apply_if_callable(cond, self)
8946 # see gh-21891
8947 if not hasattr(cond, "__invert__"):
8948 cond = np.array(cond)
8950 return self.where(
8951 ~cond,
8952 other=other,
8953 inplace=inplace,
8954 axis=axis,
8955 level=level,
8956 try_cast=try_cast,
8957 errors=errors,
8958 )
8960 _shared_docs[
8961 "shift"
8962 ] = """
8963 Shift index by desired number of periods with an optional time `freq`.
8965 When `freq` is not passed, shift the index without realigning the data.
8966 If `freq` is passed (in this case, the index must be date or datetime,
8967 or it will raise a `NotImplementedError`), the index will be
8968 increased using the periods and the `freq`.
8970 Parameters
8971 ----------
8972 periods : int
8973 Number of periods to shift. Can be positive or negative.
8974 freq : DateOffset, tseries.offsets, timedelta, or str, optional
8975 Offset to use from the tseries module or time rule (e.g. 'EOM').
8976 If `freq` is specified then the index values are shifted but the
8977 data is not realigned. That is, use `freq` if you would like to
8978 extend the index when shifting and preserve the original data.
8979 axis : {0 or 'index', 1 or 'columns', None}, default None
8980 Shift direction.
8981 fill_value : object, optional
8982 The scalar value to use for newly introduced missing values.
8983 The default depends on the dtype of `self`.
8984 For numeric data, ``np.nan`` is used.
8985 For datetime, timedelta, or period data, etc. :attr:`NaT` is used.
8986 For extension dtypes, ``self.dtype.na_value`` is used.
8988 .. versionchanged:: 0.24.0
8990 Returns
8991 -------
8992 %(klass)s
8993 Copy of input object, shifted.
8995 See Also
8996 --------
8997 Index.shift : Shift values of Index.
8998 DatetimeIndex.shift : Shift values of DatetimeIndex.
8999 PeriodIndex.shift : Shift values of PeriodIndex.
9000 tshift : Shift the time index, using the index's frequency if
9001 available.
9003 Examples
9004 --------
9005 >>> df = pd.DataFrame({'Col1': [10, 20, 15, 30, 45],
9006 ... 'Col2': [13, 23, 18, 33, 48],
9007 ... 'Col3': [17, 27, 22, 37, 52]})
9009 >>> df.shift(periods=3)
9010 Col1 Col2 Col3
9011 0 NaN NaN NaN
9012 1 NaN NaN NaN
9013 2 NaN NaN NaN
9014 3 10.0 13.0 17.0
9015 4 20.0 23.0 27.0
9017 >>> df.shift(periods=1, axis='columns')
9018 Col1 Col2 Col3
9019 0 NaN 10.0 13.0
9020 1 NaN 20.0 23.0
9021 2 NaN 15.0 18.0
9022 3 NaN 30.0 33.0
9023 4 NaN 45.0 48.0
9025 >>> df.shift(periods=3, fill_value=0)
9026 Col1 Col2 Col3
9027 0 0 0 0
9028 1 0 0 0
9029 2 0 0 0
9030 3 10 13 17
9031 4 20 23 27
9032 """
9034 @Appender(_shared_docs["shift"] % _shared_doc_kwargs)
9035 def shift(
9036 self: FrameOrSeries, periods=1, freq=None, axis=0, fill_value=None
9037 ) -> FrameOrSeries:
9038 if periods == 0:
9039 return self.copy()
9041 block_axis = self._get_block_manager_axis(axis)
9042 if freq is None:
9043 new_data = self._data.shift(
9044 periods=periods, axis=block_axis, fill_value=fill_value
9045 )
9046 else:
9047 return self.tshift(periods, freq)
9049 return self._constructor(new_data).__finalize__(self)
9051 def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries:
9052 """
9053 Equivalent to `shift` without copying data.
9055 The shifted data will not include the dropped periods and the
9056 shifted axis will be smaller than the original.
9058 Parameters
9059 ----------
9060 periods : int
9061 Number of periods to move, can be positive or negative.
9063 Returns
9064 -------
9065 shifted : same type as caller
9067 Notes
9068 -----
9069 While the `slice_shift` is faster than `shift`, you may pay for it
9070 later during alignment.
9071 """
9072 if periods == 0:
9073 return self
9075 if periods > 0:
9076 vslicer = slice(None, -periods)
9077 islicer = slice(periods, None)
9078 else:
9079 vslicer = slice(-periods, None)
9080 islicer = slice(None, periods)
9082 new_obj = self._slice(vslicer, axis=axis)
9083 shifted_axis = self._get_axis(axis)[islicer]
9084 new_obj.set_axis(shifted_axis, axis=axis, inplace=True)
9086 return new_obj.__finalize__(self)
9088 def tshift(
9089 self: FrameOrSeries, periods: int = 1, freq=None, axis=0
9090 ) -> FrameOrSeries:
9091 """
9092 Shift the time index, using the index's frequency if available.
9094 Parameters
9095 ----------
9096 periods : int
9097 Number of periods to move, can be positive or negative.
9098 freq : DateOffset, timedelta, or str, default None
9099 Increment to use from the tseries module
9100 or time rule expressed as a string (e.g. 'EOM').
9101 axis : {0 or 'index', 1 or 'columns', None}, default 0
9102 Corresponds to the axis that contains the Index.
9104 Returns
9105 -------
9106 shifted : Series/DataFrame
9108 Notes
9109 -----
9110 If freq is not specified then tries to use the freq or inferred_freq
9111 attributes of the index. If neither of those attributes exists, a
9112 ValueError is raised.
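Examples
--------
A minimal sketch using the index's own frequency (daily here), so
`freq` can be omitted.
>>> idx = pd.date_range('2020-01-01', periods=3, freq='D')
>>> ser = pd.Series([1, 2, 3], index=idx)
>>> ser.tshift(1)
2020-01-02    1
2020-01-03    2
2020-01-04    3
Freq: D, dtype: int64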
9113 """
9115 index = self._get_axis(axis)
9116 if freq is None:
9117 freq = getattr(index, "freq", None)
9119 if freq is None:
9120 freq = getattr(index, "inferred_freq", None)
9122 if freq is None:
9123 msg = "Freq was not given and was not set in the index"
9124 raise ValueError(msg)
9126 if periods == 0:
9127 return self
9129 if isinstance(freq, str):
9130 freq = to_offset(freq)
9132 block_axis = self._get_block_manager_axis(axis)
9133 if isinstance(index, PeriodIndex):
9134 orig_freq = to_offset(index.freq)
9135 if freq == orig_freq:
9136 new_data = self._data.copy()
9137 new_data.axes[block_axis] = index.shift(periods)
9138 elif orig_freq is not None:
9139 msg = (
9140 f"Given freq {freq.rule_code} does not match"
9141 f" PeriodIndex freq {orig_freq.rule_code}"
9142 )
9143 raise ValueError(msg)
9144 else:
9145 new_data = self._data.copy()
9146 new_data.axes[block_axis] = index.shift(periods, freq)
9148 return self._constructor(new_data).__finalize__(self)
9150 def truncate(
9151 self: FrameOrSeries, before=None, after=None, axis=None, copy: bool_t = True
9152 ) -> FrameOrSeries:
9153 """
9154 Truncate a Series or DataFrame before and after some index value.
9156 This is a useful shorthand for boolean indexing based on index
9157 values above or below certain thresholds.
9159 Parameters
9160 ----------
9161 before : date, str, int
9162 Truncate all rows before this index value.
9163 after : date, str, int
9164 Truncate all rows after this index value.
9165 axis : {0 or 'index', 1 or 'columns'}, optional
9166 Axis to truncate. Truncates the index (rows) by default.
9167 copy : bool, default True
9168 Return a copy of the truncated section.
9170 Returns
9171 -------
9172 type of caller
9173 The truncated Series or DataFrame.
9175 See Also
9176 --------
9177 DataFrame.loc : Select a subset of a DataFrame by label.
9178 DataFrame.iloc : Select a subset of a DataFrame by position.
9180 Notes
9181 -----
9182 If the index being truncated contains only datetime values,
9183 `before` and `after` may be specified as strings instead of
9184 Timestamps.
9186 Examples
9187 --------
9188 >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
9189 ... 'B': ['f', 'g', 'h', 'i', 'j'],
9190 ... 'C': ['k', 'l', 'm', 'n', 'o']},
9191 ... index=[1, 2, 3, 4, 5])
9192 >>> df
9193 A B C
9194 1 a f k
9195 2 b g l
9196 3 c h m
9197 4 d i n
9198 5 e j o
9200 >>> df.truncate(before=2, after=4)
9201 A B C
9202 2 b g l
9203 3 c h m
9204 4 d i n
9206 The columns of a DataFrame can be truncated.
9208 >>> df.truncate(before="A", after="B", axis="columns")
9209 A B
9210 1 a f
9211 2 b g
9212 3 c h
9213 4 d i
9214 5 e j
9216 For Series, only rows can be truncated.
9218 >>> df['A'].truncate(before=2, after=4)
9219 2 b
9220 3 c
9221 4 d
9222 Name: A, dtype: object
9224 The index values in ``truncate`` can be datetimes or string
9225 dates.
9227 >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s')
9228 >>> df = pd.DataFrame(index=dates, data={'A': 1})
9229 >>> df.tail()
9230 A
9231 2016-01-31 23:59:56 1
9232 2016-01-31 23:59:57 1
9233 2016-01-31 23:59:58 1
9234 2016-01-31 23:59:59 1
9235 2016-02-01 00:00:00 1
9237 >>> df.truncate(before=pd.Timestamp('2016-01-05'),
9238 ... after=pd.Timestamp('2016-01-10')).tail()
9239 A
9240 2016-01-09 23:59:56 1
9241 2016-01-09 23:59:57 1
9242 2016-01-09 23:59:58 1
9243 2016-01-09 23:59:59 1
9244 2016-01-10 00:00:00 1
9246 Because the index is a DatetimeIndex containing only dates, we can
9247 specify `before` and `after` as strings. They will be coerced to
9248 Timestamps before truncation.
9250 >>> df.truncate('2016-01-05', '2016-01-10').tail()
9251 A
9252 2016-01-09 23:59:56 1
9253 2016-01-09 23:59:57 1
9254 2016-01-09 23:59:58 1
9255 2016-01-09 23:59:59 1
9256 2016-01-10 00:00:00 1
9258 Note that ``truncate`` assumes a 0 value for any unspecified time
9259 component (midnight). This differs from partial string slicing, which
9260 returns any partially matching dates.
9262 >>> df.loc['2016-01-05':'2016-01-10', :].tail()
9263 A
9264 2016-01-10 23:59:55 1
9265 2016-01-10 23:59:56 1
9266 2016-01-10 23:59:57 1
9267 2016-01-10 23:59:58 1
9268 2016-01-10 23:59:59 1
9269 """
9270 if axis is None:
9271 axis = self._stat_axis_number
9272 axis = self._get_axis_number(axis)
9273 ax = self._get_axis(axis)
9275 # GH 17935
9276 # Check that index is sorted
9277 if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
9278 raise ValueError("truncate requires a sorted index")
9280 # if we have a date index, convert to dates, otherwise
9281 # treat like a slice
9282 if ax.is_all_dates:
9283 from pandas.core.tools.datetimes import to_datetime
9285 before = to_datetime(before)
9286 after = to_datetime(after)
9288 if before is not None and after is not None:
9289 if before > after:
9290 raise ValueError(f"Truncate: {after} must be after {before}")
9292 slicer = [slice(None, None)] * self._AXIS_LEN
9293 slicer[axis] = slice(before, after)
9294 result = self.loc[tuple(slicer)]
9296 if isinstance(ax, MultiIndex):
9297 setattr(result, self._get_axis_name(axis), ax.truncate(before, after))
9299 if copy:
9300 result = result.copy()
9302 return result
9304 def tz_convert(
9305 self: FrameOrSeries, tz, axis=0, level=None, copy: bool_t = True
9306 ) -> FrameOrSeries:
9307 """
9308 Convert tz-aware axis to target time zone.
9310 Parameters
9311 ----------
9312 tz : str or tzinfo object
9313 axis : the axis to convert
9314 level : int, str, default None
9315 If axis is a MultiIndex, convert a specific level. Otherwise
9316 must be None.
9317 copy : bool, default True
9318 Also make a copy of the underlying data.
9320 Returns
9321 -------
9322 %(klass)s
9323 Object with time zone converted axis.
9325 Raises
9326 ------
9327 TypeError
9328 If the axis is tz-naive.
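Examples
--------
A minimal sketch converting a UTC-indexed Series to another time zone;
the wall-clock labels change but the underlying instants do not.
>>> idx = pd.date_range('2020-01-01', periods=2, freq='H', tz='UTC')
>>> ser = pd.Series([1, 2], index=idx)
>>> ser.tz_convert('US/Eastern')
2019-12-31 19:00:00-05:00    1
2019-12-31 20:00:00-05:00    2
Freq: H, dtype: int64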
9329 """
9330 axis = self._get_axis_number(axis)
9331 ax = self._get_axis(axis)
9333 def _tz_convert(ax, tz):
9334 if not hasattr(ax, "tz_convert"):
9335 if len(ax) > 0:
9336 ax_name = self._get_axis_name(axis)
9337 raise TypeError(
9338 f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
9339 )
9340 else:
9341 ax = DatetimeIndex([], tz=tz)
9342 else:
9343 ax = ax.tz_convert(tz)
9344 return ax
9346 # if a level is given it must be a MultiIndex level or
9347 # equivalent to the axis name
9348 if isinstance(ax, MultiIndex):
9349 level = ax._get_level_number(level)
9350 new_level = _tz_convert(ax.levels[level], tz)
9351 ax = ax.set_levels(new_level, level=level)
9352 else:
9353 if level not in (None, 0, ax.name):
9354 raise ValueError(f"The level {level} is not valid")
9355 ax = _tz_convert(ax, tz)
9357 result = self._constructor(self._data, copy=copy)
9358 result = result.set_axis(ax, axis=axis, inplace=False)
9359 return result.__finalize__(self)
9361 def tz_localize(
9362 self: FrameOrSeries,
9363 tz,
9364 axis=0,
9365 level=None,
9366 copy: bool_t = True,
9367 ambiguous="raise",
9368 nonexistent: str = "raise",
9369 ) -> FrameOrSeries:
9370 """
9371 Localize tz-naive index of a Series or DataFrame to target time zone.
9373 This operation localizes the Index. To localize the values in a
9374 timezone-naive Series, use :meth:`Series.dt.tz_localize`.
9376 Parameters
9377 ----------
9378 tz : str or tzinfo
9379 axis : the axis to localize
9380 level : int, str, default None
9381 If axis is a MultiIndex, localize a specific level. Otherwise
9382 must be None.
9383 copy : bool, default True
9384 Also make a copy of the underlying data.
9385 ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
9386 When clocks moved backward due to DST, ambiguous times may arise.
9387 For example in Central European Time (UTC+01), when going from
9388 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
9389 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
9390 `ambiguous` parameter dictates how ambiguous times should be
9391 handled.
9393 - 'infer' will attempt to infer fall dst-transition hours based on
9394 order
9395 - bool-ndarray where True signifies a DST time, False designates
9396 a non-DST time (note that this flag is only applicable for
9397 ambiguous times)
9398 - 'NaT' will return NaT where there are ambiguous times
9399 - 'raise' will raise an AmbiguousTimeError if there are ambiguous
9400 times.
9401 nonexistent : str, default 'raise'
9402 A nonexistent time does not exist in a particular timezone
9403 where clocks moved forward due to DST. Valid values are:
9405 - 'shift_forward' will shift the nonexistent time forward to the
9406 closest existing time
9407 - 'shift_backward' will shift the nonexistent time backward to the
9408 closest existing time
9409 - 'NaT' will return NaT where there are nonexistent times
9410 - timedelta objects will shift nonexistent times by the timedelta
9411 - 'raise' will raise a NonExistentTimeError if there are
9412 nonexistent times.
9414 .. versionadded:: 0.24.0
9416 Returns
9417 -------
9418 Series or DataFrame
9419 Same type as the input.
9421 Raises
9422 ------
9423 TypeError
9424 If the TimeSeries is tz-aware and tz is not None.
9426 Examples
9427 --------
9429 Localize local times:
9431 >>> s = pd.Series([1],
9432 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']))
9433 >>> s.tz_localize('CET')
9434 2018-09-15 01:30:00+02:00 1
9435 dtype: int64
9437 Be careful with DST changes. When there is sequential data, pandas
9438 can infer the DST time:
9440 >>> s = pd.Series(range(7),
9441 ... index=pd.DatetimeIndex(['2018-10-28 01:30:00',
9442 ... '2018-10-28 02:00:00',
9443 ... '2018-10-28 02:30:00',
9444 ... '2018-10-28 02:00:00',
9445 ... '2018-10-28 02:30:00',
9446 ... '2018-10-28 03:00:00',
9447 ... '2018-10-28 03:30:00']))
9448 >>> s.tz_localize('CET', ambiguous='infer')
9449 2018-10-28 01:30:00+02:00 0
9450 2018-10-28 02:00:00+02:00 1
9451 2018-10-28 02:30:00+02:00 2
9452 2018-10-28 02:00:00+01:00 3
9453 2018-10-28 02:30:00+01:00 4
9454 2018-10-28 03:00:00+01:00 5
9455 2018-10-28 03:30:00+01:00 6
9456 dtype: int64
9458 In some cases, inferring the DST is impossible. In such cases, you can
9459 pass an ndarray to the ambiguous parameter to set the DST explicitly.
9461 >>> s = pd.Series(range(3),
9462 ... index=pd.DatetimeIndex(['2018-10-28 01:20:00',
9463 ... '2018-10-28 02:36:00',
9464 ... '2018-10-28 03:46:00']))
9465 >>> s.tz_localize('CET', ambiguous=np.array([True, True, False]))
9466 2018-10-28 01:20:00+02:00 0
9467 2018-10-28 02:36:00+02:00 1
9468 2018-10-28 03:46:00+01:00 2
9469 dtype: int64
9471 If the DST transition causes nonexistent times, you can shift these
9472 dates forward or backward with a timedelta object or `'shift_forward'`
9473 or `'shift_backward'`.
9474 >>> s = pd.Series(range(2),
9475 ... index=pd.DatetimeIndex(['2015-03-29 02:30:00',
9476 ... '2015-03-29 03:30:00']))
9477 >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
9478 2015-03-29 03:00:00+02:00 0
9479 2015-03-29 03:30:00+02:00 1
9480 dtype: int64
9481 >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward')
9482 2015-03-29 01:59:59.999999999+01:00 0
9483 2015-03-29 03:30:00+02:00 1
9484 dtype: int64
9485 >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H'))
9486 2015-03-29 03:30:00+02:00 0
9487 2015-03-29 03:30:00+02:00 1
9488 dtype: int64
9489 """
9490 nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
9491 if nonexistent not in nonexistent_options and not isinstance(
9492 nonexistent, timedelta
9493 ):
9494 raise ValueError(
9495 "The nonexistent argument must be one of 'raise', "
9496 "'NaT', 'shift_forward', 'shift_backward' or "
9497 "a timedelta object"
9498 )
9500 axis = self._get_axis_number(axis)
9501 ax = self._get_axis(axis)
9503 def _tz_localize(ax, tz, ambiguous, nonexistent):
9504 if not hasattr(ax, "tz_localize"):
9505 if len(ax) > 0:
9506 ax_name = self._get_axis_name(axis)
9507 raise TypeError(
9508 f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
9509 )
9510 else:
9511 ax = DatetimeIndex([], tz=tz)
9512 else:
9513 ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent)
9514 return ax
9516 # if a level is given it must be a MultiIndex level or
9517 # equivalent to the axis name
9518 if isinstance(ax, MultiIndex):
9519 level = ax._get_level_number(level)
9520 new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent)
9521 ax = ax.set_levels(new_level, level=level)
9522 else:
9523 if level not in (None, 0, ax.name):
9524 raise ValueError(f"The level {level} is not valid")
9525 ax = _tz_localize(ax, tz, ambiguous, nonexistent)
9527 result = self._constructor(self._data, copy=copy)
9528 result = result.set_axis(ax, axis=axis, inplace=False)
9529 return result.__finalize__(self)
9531 # ----------------------------------------------------------------------
9532 # Numeric Methods
9533 def abs(self: FrameOrSeries) -> FrameOrSeries:
9534 """
9535 Return a Series/DataFrame with absolute numeric value of each element.
9537 This function only applies to elements that are all numeric.
9539 Returns
9540 -------
9541 abs
9542 Series/DataFrame containing the absolute value of each element.
9544 See Also
9545 --------
9546 numpy.absolute : Calculate the absolute value element-wise.
9548 Notes
9549 -----
9550 For ``complex`` inputs (e.g. ``1.2 + 1j``), the absolute value is
9551 :math:`\\sqrt{ a^2 + b^2 }`.
9553 Examples
9554 --------
9555 Absolute numeric values in a Series.
9557 >>> s = pd.Series([-1.10, 2, -3.33, 4])
9558 >>> s.abs()
9559 0 1.10
9560 1 2.00
9561 2 3.33
9562 3 4.00
9563 dtype: float64
9565 Absolute numeric values in a Series with complex numbers.
9567 >>> s = pd.Series([1.2 + 1j])
9568 >>> s.abs()
9569 0 1.56205
9570 dtype: float64
9572 Absolute numeric values in a Series with a Timedelta element.
9574 >>> s = pd.Series([pd.Timedelta('1 days')])
9575 >>> s.abs()
9576 0 1 days
9577 dtype: timedelta64[ns]
9579 Select rows with data closest to a certain value using argsort (from
9580 `StackOverflow <https://stackoverflow.com/a/17758115>`__).
9582 >>> df = pd.DataFrame({
9583 ... 'a': [4, 5, 6, 7],
9584 ... 'b': [10, 20, 30, 40],
9585 ... 'c': [100, 50, -30, -50]
9586 ... })
9587 >>> df
9588 a b c
9589 0 4 10 100
9590 1 5 20 50
9591 2 6 30 -30
9592 3 7 40 -50
9593 >>> df.loc[(df.c - 43).abs().argsort()]
9594 a b c
9595 1 5 20 50
9596 0 4 10 100
9597 2 6 30 -30
9598 3 7 40 -50
9599 """
9600 return np.abs(self)
9602 def describe(
9603 self: FrameOrSeries, percentiles=None, include=None, exclude=None
9604 ) -> FrameOrSeries:
9605 """
9606 Generate descriptive statistics.
9608 Descriptive statistics include those that summarize the central
9609 tendency, dispersion and shape of a
9610 dataset's distribution, excluding ``NaN`` values.
9612 Analyzes both numeric and object series, as well
9613 as ``DataFrame`` column sets of mixed data types. The output
9614 will vary depending on what is provided. Refer to the notes
9615 below for more detail.
9617 Parameters
9618 ----------
9619 percentiles : list-like of numbers, optional
9620 The percentiles to include in the output. All should
9621 fall between 0 and 1. The default is
9622 ``[.25, .5, .75]``, which returns the 25th, 50th, and
9623 75th percentiles.
9624 include : 'all', list-like of dtypes or None (default), optional
9625 A white list of data types to include in the result. Ignored
9626 for ``Series``. Here are the options:
9628 - 'all' : All columns of the input will be included in the output.
9629 - A list-like of dtypes : Limits the results to the
9630 provided data types.
9631 To limit the result to numeric types submit
9632 ``numpy.number``. To limit it instead to object columns submit
9633 the ``numpy.object`` data type. Strings
9634 can also be used in the style of
9635 ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
9636 select pandas categorical columns, use ``'category'``
9637 - None (default) : The result will include all numeric columns.
9638 exclude : list-like of dtypes or None (default), optional
9639 A black list of data types to omit from the result. Ignored
9640 for ``Series``. Here are the options:
9642 - A list-like of dtypes : Excludes the provided data types
9643 from the result. To exclude numeric types submit
9644 ``numpy.number``. To exclude object columns submit the data
9645 type ``numpy.object``. Strings can also be used in the style of
9646 ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
9647 exclude pandas categorical columns, use ``'category'``
9648 - None (default) : The result will exclude nothing.
9650 Returns
9651 -------
9652 Series or DataFrame
9653 Summary statistics of the Series or Dataframe provided.
9655 See Also
9656 --------
9657 DataFrame.count: Count number of non-NA/null observations.
9658 DataFrame.max: Maximum of the values in the object.
9659 DataFrame.min: Minimum of the values in the object.
9660 DataFrame.mean: Mean of the values.
9661 DataFrame.std: Standard deviation of the observations.
9662 DataFrame.select_dtypes: Subset of a DataFrame including/excluding
9663 columns based on their dtype.
9665 Notes
9666 -----
9667 For numeric data, the result's index will include ``count``,
9668 ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
9669 upper percentiles. By default the lower percentile is ``25`` and the
9670 upper percentile is ``75``. The ``50`` percentile is the
9671 same as the median.
9673 For object data (e.g. strings or timestamps), the result's index
9674 will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``
9675 is the most common value. The ``freq`` is the most common value's
9676 frequency. Timestamps also include the ``first`` and ``last`` items.
9678 If multiple object values have the highest count, then the
9679 ``count`` and ``top`` results will be arbitrarily chosen from
9680 among those with the highest count.
9682 For mixed data types provided via a ``DataFrame``, the default is to
9683 return only an analysis of numeric columns. If the dataframe consists
9684 only of object and categorical data without any numeric columns, the
9685 default is to return an analysis of both the object and categorical
9686 columns. If ``include='all'`` is provided as an option, the result
9687 will include a union of attributes of each type.
9689 The `include` and `exclude` parameters can be used to limit
9690 which columns in a ``DataFrame`` are analyzed for the output.
9691 The parameters are ignored when analyzing a ``Series``.
9693 Examples
9694 --------
9695 Describing a numeric ``Series``.
9697 >>> s = pd.Series([1, 2, 3])
9698 >>> s.describe()
9699 count 3.0
9700 mean 2.0
9701 std 1.0
9702 min 1.0
9703 25% 1.5
9704 50% 2.0
9705 75% 2.5
9706 max 3.0
9707 dtype: float64
9709 Describing a categorical ``Series``.
9711 >>> s = pd.Series(['a', 'a', 'b', 'c'])
9712 >>> s.describe()
9713 count 4
9714 unique 3
9715 top a
9716 freq 2
9717 dtype: object
9719 Describing a timestamp ``Series``.
9721 >>> s = pd.Series([
9722 ... np.datetime64("2000-01-01"),
9723 ... np.datetime64("2010-01-01"),
9724 ... np.datetime64("2010-01-01")
9725 ... ])
9726 >>> s.describe()
9727 count 3
9728 unique 2
9729 top 2010-01-01 00:00:00
9730 freq 2
9731 first 2000-01-01 00:00:00
9732 last 2010-01-01 00:00:00
9733 dtype: object
9735 Describing a ``DataFrame``. By default only numeric fields
9736 are returned.
9738 >>> df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']),
9739 ... 'numeric': [1, 2, 3],
9740 ... 'object': ['a', 'b', 'c']
9741 ... })
9742 >>> df.describe()
9743 numeric
9744 count 3.0
9745 mean 2.0
9746 std 1.0
9747 min 1.0
9748 25% 1.5
9749 50% 2.0
9750 75% 2.5
9751 max 3.0
9753 Describing all columns of a ``DataFrame`` regardless of data type.
9755 >>> df.describe(include='all')
9756 categorical numeric object
9757 count 3 3.0 3
9758 unique 3 NaN 3
9759 top f NaN c
9760 freq 1 NaN 1
9761 mean NaN 2.0 NaN
9762 std NaN 1.0 NaN
9763 min NaN 1.0 NaN
9764 25% NaN 1.5 NaN
9765 50% NaN 2.0 NaN
9766 75% NaN 2.5 NaN
9767 max NaN 3.0 NaN
9769 Describing a column from a ``DataFrame`` by accessing it as
9770 an attribute.
9772 >>> df.numeric.describe()
9773 count 3.0
9774 mean 2.0
9775 std 1.0
9776 min 1.0
9777 25% 1.5
9778 50% 2.0
9779 75% 2.5
9780 max 3.0
9781 Name: numeric, dtype: float64
9783 Including only numeric columns in a ``DataFrame`` description.
9785 >>> df.describe(include=[np.number])
9786 numeric
9787 count 3.0
9788 mean 2.0
9789 std 1.0
9790 min 1.0
9791 25% 1.5
9792 50% 2.0
9793 75% 2.5
9794 max 3.0
9796 Including only string columns in a ``DataFrame`` description.
9798 >>> df.describe(include=[np.object])
9799 object
9800 count 3
9801 unique 3
9802 top c
9803 freq 1
9805 Including only categorical columns from a ``DataFrame`` description.
9807 >>> df.describe(include=['category'])
9808 categorical
9809 count 3
9810 unique 3
9811 top f
9812 freq 1
9814 Excluding numeric columns from a ``DataFrame`` description.
9816 >>> df.describe(exclude=[np.number])
9817 categorical object
9818 count 3 3
9819 unique 3 3
9820 top f c
9821 freq 1 1
9823 Excluding object columns from a ``DataFrame`` description.
9825 >>> df.describe(exclude=[np.object])
9826 categorical numeric
9827 count 3 3.0
9828 unique 3 NaN
9829 top f NaN
9830 freq 1 NaN
9831 mean NaN 2.0
9832 std NaN 1.0
9833 min NaN 1.0
9834 25% NaN 1.5
9835 50% NaN 2.0
9836 75% NaN 2.5
9837 max NaN 3.0
9838 """
9839 if self.ndim == 2 and self.columns.size == 0:
9840 raise ValueError("Cannot describe a DataFrame without columns")
9842 if percentiles is not None:
9843 # explicit conversion of `percentiles` to list
9844 percentiles = list(percentiles)
9846 # get them all to be in [0, 1]
9847 validate_percentile(percentiles)
9849 # median should always be included
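# (e.g. percentiles=[0.1] becomes [0.1, 0.5])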
9850 if 0.5 not in percentiles:
9851 percentiles.append(0.5)
9852 percentiles = np.asarray(percentiles)
9853 else:
9854 percentiles = np.array([0.25, 0.5, 0.75])
9856 # sort and check for duplicates
9857 unique_pcts = np.unique(percentiles)
9858 if len(unique_pcts) < len(percentiles):
9859 raise ValueError("percentiles cannot contain duplicates")
9860 percentiles = unique_pcts
9862 formatted_percentiles = format_percentiles(percentiles)
9864 def describe_numeric_1d(series):
9865 stat_index = (
9866 ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
9867 )
9868 d = (
9869 [series.count(), series.mean(), series.std(), series.min()]
9870 + series.quantile(percentiles).tolist()
9871 + [series.max()]
9872 )
9873 return pd.Series(d, index=stat_index, name=series.name)
9875 def describe_categorical_1d(data):
9876 names = ["count", "unique"]
9877 objcounts = data.value_counts()
9878 count_unique = len(objcounts[objcounts != 0])
9879 result = [data.count(), count_unique]
9880 dtype = None
9881 if result[1] > 0:
9882 top, freq = objcounts.index[0], objcounts.iloc[0]
9884 if is_datetime64_any_dtype(data):
9885 tz = data.dt.tz
9886 asint = data.dropna().values.view("i8")
9887 top = Timestamp(top)
9888 if top.tzinfo is not None and tz is not None:
9889 # Don't tz_localize(None) if key is already tz-aware
9890 top = top.tz_convert(tz)
9891 else:
9892 top = top.tz_localize(tz)
9893 names += ["top", "freq", "first", "last"]
9894 result += [
9895 top,
9896 freq,
9897 Timestamp(asint.min(), tz=tz),
9898 Timestamp(asint.max(), tz=tz),
9899 ]
9900 else:
9901 names += ["top", "freq"]
9902 result += [top, freq]
9904 # If the data is empty, set 'top' and 'freq' to NaN
9905 # to maintain output shape consistency
9906 else:
9907 names += ["top", "freq"]
9908 result += [np.nan, np.nan]
9909 dtype = "object"
9911 return pd.Series(result, index=names, name=data.name, dtype=dtype)
9913 def describe_1d(data):
9914 if is_bool_dtype(data):
9915 return describe_categorical_1d(data)
9916 elif is_numeric_dtype(data):
9917 return describe_numeric_1d(data)
9918 elif is_timedelta64_dtype(data):
9919 return describe_numeric_1d(data)
9920 else:
9921 return describe_categorical_1d(data)
9923 if self.ndim == 1:
9924 return describe_1d(self)
9925 elif (include is None) and (exclude is None):
9926 # when some numerics are found, keep only numerics
9927 data = self.select_dtypes(include=[np.number])
9928 if len(data.columns) == 0:
9929 data = self
9930 elif include == "all":
9931 if exclude is not None:
9932 msg = "exclude must be None when include is 'all'"
9933 raise ValueError(msg)
9934 data = self
9935 else:
9936 data = self.select_dtypes(include=include, exclude=exclude)
9938 ldesc = [describe_1d(s) for _, s in data.items()]
9939 # set a convenient order for rows
9940 names: List[Optional[Hashable]] = []
9941 ldesc_indexes = sorted((x.index for x in ldesc), key=len)
9942 for idxnames in ldesc_indexes:
9943 for name in idxnames:
9944 if name not in names:
9945 names.append(name)
9947 d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
9948 d.columns = data.columns.copy()
9949 return d
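# Illustration of the percentile handling above: 0.5 is always re-added,
# so custom percentiles still report the median (a sketch; float
# formatting approximate):
# >>> pd.Series(range(10)).describe(percentiles=[0.1, 0.9])
# count    10.000000
# mean      4.500000
# std       3.027650
# min       0.000000
# 10%       0.900000
# 50%       4.500000
# 90%       8.100000
# max       9.000000
# dtype: float64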
9951 _shared_docs[
9952 "pct_change"
9953 ] = """
9954 Percentage change between the current and a prior element.
9956 Computes the percentage change from the immediately previous row by
9957 default. This is useful for comparing the rate of change across the
9958 elements of a time series.
9960 Parameters
9961 ----------
9962 periods : int, default 1
9963 Periods to shift for forming percent change.
9964 fill_method : str, default 'pad'
9965 How to handle NAs before computing percent changes.
9966 limit : int, default None
9967 The number of consecutive NAs to fill before stopping.
9968 freq : DateOffset, timedelta, or str, optional
9969 Increment to use from time series API (e.g. 'M' or BDay()).
9970 **kwargs
9971 Additional keyword arguments are passed into
9972 `DataFrame.shift` or `Series.shift`.
9974 Returns
9975 -------
9976 chg : Series or DataFrame
9977 The same type as the calling object.
9979 See Also
9980 --------
9981 Series.diff : Compute the difference of two elements in a Series.
9982 DataFrame.diff : Compute the difference of two elements in a DataFrame.
9983 Series.shift : Shift the index by some number of periods.
9984 DataFrame.shift : Shift the index by some number of periods.
9986 Examples
9987 --------
9988 **Series**
9990 >>> s = pd.Series([90, 91, 85])
9991 >>> s
9992 0 90
9993 1 91
9994 2 85
9995 dtype: int64
9997 >>> s.pct_change()
9998 0 NaN
9999 1 0.011111
10000 2 -0.065934
10001 dtype: float64
10003 >>> s.pct_change(periods=2)
10004 0 NaN
10005 1 NaN
10006 2 -0.055556
10007 dtype: float64
10009 See the percentage change in a Series where NAs are filled with the
10010 last valid observation and carried forward to the next valid one.
10012 >>> s = pd.Series([90, 91, None, 85])
10013 >>> s
10014 0 90.0
10015 1 91.0
10016 2 NaN
10017 3 85.0
10018 dtype: float64
10020 >>> s.pct_change(fill_method='ffill')
10021 0 NaN
10022 1 0.011111
10023 2 0.000000
10024 3 -0.065934
10025 dtype: float64
10027 **DataFrame**
10029 Percentage change in French franc, Deutsche Mark, and Italian lira from
10030 1980-01-01 to 1980-03-01.
10032 >>> df = pd.DataFrame({
10033 ... 'FR': [4.0405, 4.0963, 4.3149],
10034 ... 'GR': [1.7246, 1.7482, 1.8519],
10035 ... 'IT': [804.74, 810.01, 860.13]},
10036 ... index=['1980-01-01', '1980-02-01', '1980-03-01'])
10037 >>> df
10038 FR GR IT
10039 1980-01-01 4.0405 1.7246 804.74
10040 1980-02-01 4.0963 1.7482 810.01
10041 1980-03-01 4.3149 1.8519 860.13
10043 >>> df.pct_change()
10044 FR GR IT
10045 1980-01-01 NaN NaN NaN
10046 1980-02-01 0.013810 0.013684 0.006549
10047 1980-03-01 0.053365 0.059318 0.061876
10049 Percentage change in GOOG and APPL stock volume. This demonstrates
10050 computing the percentage change between columns.
10052 >>> df = pd.DataFrame({
10053 ... '2016': [1769950, 30586265],
10054 ... '2015': [1500923, 40912316],
10055 ... '2014': [1371819, 41403351]},
10056 ... index=['GOOG', 'APPL'])
10057 >>> df
10058 2016 2015 2014
10059 GOOG 1769950 1500923 1371819
10060 APPL 30586265 40912316 41403351
10062 >>> df.pct_change(axis='columns')
10063 2016 2015 2014
10064 GOOG NaN -0.151997 -0.086016
10065 APPL NaN 0.337604 0.012002
10066 """
10068 @Appender(_shared_docs["pct_change"] % _shared_doc_kwargs)
10069 def pct_change(
10070 self: FrameOrSeries,
10071 periods=1,
10072 fill_method="pad",
10073 limit=None,
10074 freq=None,
10075 **kwargs,
10076 ) -> FrameOrSeries:
10077 # TODO: Not sure if above is correct - need someone to confirm.
10078 axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name))
10079 if fill_method is None:
10080 data = self
10081 else:
10082 _data = self.fillna(method=fill_method, axis=axis, limit=limit)
10083 assert _data is not None # needed for mypy
10084 data = _data
10086 rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1
10087 if freq is not None:
10088 # Shift method is implemented differently when freq is not None
10089 # We want to restore the original index
10090 rs = rs.loc[~rs.index.duplicated()]
10091 rs = rs.reindex_like(data)
10092 return rs
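# After the fillna step, pct_change reduces to a shift-and-divide; a
# minimal equivalence sketch using the defaults (periods=1, 'pad'):
# >>> s = pd.Series([90, 91, None, 85])
# >>> filled = s.fillna(method='pad')
# >>> s.pct_change().equals(filled.div(filled.shift(1)) - 1)
# True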
10094 def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs):
10095 if axis is None:
10096 raise ValueError("Must specify 'axis' when aggregating by level.")
10097 grouped = self.groupby(level=level, axis=axis, sort=False)
10098 if hasattr(grouped, name) and skipna:
10099 return getattr(grouped, name)(**kwargs)
10100 axis = self._get_axis_number(axis)
10101 method = getattr(type(self), name)
10102 applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwargs)
10103 return grouped.aggregate(applyf)
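# _agg_by_level routes level-wise reductions through groupby, so e.g.
# ``s.sum(level=0)`` matches ``s.groupby(level=0).sum()`` (sketch,
# assuming a two-level MultiIndex):
# >>> idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
# >>> s = pd.Series([1, 2, 3], index=idx)
# >>> s.sum(level=0).equals(s.groupby(level=0).sum())
# True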
10105 @classmethod
10106 def _add_numeric_operations(cls):
10107 """
10108 Add the operations to the cls; evaluate the doc strings again
10109 """
10111 axis_descr, name, name2 = _doc_parms(cls)
10113 cls.any = _make_logical_function(
10114 cls,
10115 "any",
10116 name,
10117 name2,
10118 axis_descr,
10119 _any_desc,
10120 nanops.nanany,
10121 _any_see_also,
10122 _any_examples,
10123 empty_value=False,
10124 )
10125 cls.all = _make_logical_function(
10126 cls,
10127 "all",
10128 name,
10129 name2,
10130 axis_descr,
10131 _all_desc,
10132 nanops.nanall,
10133 _all_see_also,
10134 _all_examples,
10135 empty_value=True,
10136 )
10138 @Substitution(
10139 desc="Return the mean absolute deviation of the values "
10140 "for the requested axis.",
10141 name1=name,
10142 name2=name2,
10143 axis_descr=axis_descr,
10144 min_count="",
10145 see_also="",
10146 examples="",
10147 )
10148 @Appender(_num_doc)
10149 def mad(self, axis=None, skipna=None, level=None):
10150 if skipna is None:
10151 skipna = True
10152 if axis is None:
10153 axis = self._stat_axis_number
10154 if level is not None:
10155 return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna)
10157 data = self._get_numeric_data()
10158 if axis == 0:
10159 demeaned = data - data.mean(axis=0)
10160 else:
10161 demeaned = data.sub(data.mean(axis=1), axis=0)
10162 return np.abs(demeaned).mean(axis=axis, skipna=skipna)
10164 cls.mad = mad
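# Note that mad() is the mean absolute deviation about the mean, not the
# median-based MAD; a hand check of the formula above (sketch):
# >>> s = pd.Series([1, 2, 3, 4])
# >>> s.mad() == (s - s.mean()).abs().mean()
# True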
10166 cls.sem = _make_stat_function_ddof(
10167 cls,
10168 "sem",
10169 name,
10170 name2,
10171 axis_descr,
10172 "Return unbiased standard error of the mean over requested "
10173 "axis.\n\nNormalized by N-1 by default. This can be changed "
10174 "using the ddof argument",
10175 nanops.nansem,
10176 )
10177 cls.var = _make_stat_function_ddof(
10178 cls,
10179 "var",
10180 name,
10181 name2,
10182 axis_descr,
10183 "Return unbiased variance over requested axis.\n\nNormalized by "
10184 "N-1 by default. This can be changed using the ddof argument",
10185 nanops.nanvar,
10186 )
10187 cls.std = _make_stat_function_ddof(
10188 cls,
10189 "std",
10190 name,
10191 name2,
10192 axis_descr,
10193 "Return sample standard deviation over requested axis."
10194 "\n\nNormalized by N-1 by default. This can be changed using the "
10195 "ddof argument",
10196 nanops.nanstd,
10197 )
10199 cls.cummin = _make_cum_function(
10200 cls,
10201 "cummin",
10202 name,
10203 name2,
10204 axis_descr,
10205 "minimum",
10206 np.minimum.accumulate,
10207 "min",
10208 np.inf,
10209 np.nan,
10210 _cummin_examples,
10211 )
10212 cls.cumsum = _make_cum_function(
10213 cls,
10214 "cumsum",
10215 name,
10216 name2,
10217 axis_descr,
10218 "sum",
10219 np.cumsum,
10220 "sum",
10221 0.0,
10222 np.nan,
10223 _cumsum_examples,
10224 )
10225 cls.cumprod = _make_cum_function(
10226 cls,
10227 "cumprod",
10228 name,
10229 name2,
10230 axis_descr,
10231 "product",
10232 np.cumprod,
10233 "prod",
10234 1.0,
10235 np.nan,
10236 _cumprod_examples,
10237 )
10238 cls.cummax = _make_cum_function(
10239 cls,
10240 "cummax",
10241 name,
10242 name2,
10243 axis_descr,
10244 "maximum",
10245 np.maximum.accumulate,
10246 "max",
10247 -np.inf,
10248 np.nan,
10249 _cummax_examples,
10250 )
10252 cls.sum = _make_min_count_stat_function(
10253 cls,
10254 "sum",
10255 name,
10256 name2,
10257 axis_descr,
10258 """Return the sum of the values for the requested axis.\n
10259 This is equivalent to the method ``numpy.sum``.""",
10260 nanops.nansum,
10261 _stat_func_see_also,
10262 _sum_examples,
10263 )
10264 cls.mean = _make_stat_function(
10265 cls,
10266 "mean",
10267 name,
10268 name2,
10269 axis_descr,
10270 "Return the mean of the values for the requested axis.",
10271 nanops.nanmean,
10272 )
10273 cls.skew = _make_stat_function(
10274 cls,
10275 "skew",
10276 name,
10277 name2,
10278 axis_descr,
10279 "Return unbiased skew over requested axis.\n\nNormalized by N-1.",
10280 nanops.nanskew,
10281 )
10282 cls.kurt = _make_stat_function(
10283 cls,
10284 "kurt",
10285 name,
10286 name2,
10287 axis_descr,
10288 "Return unbiased kurtosis over requested axis.\n\n"
10289 "Kurtosis obtained using Fisher's definition of\n"
10290 "kurtosis (kurtosis of normal == 0.0). Normalized "
10291 "by N-1.",
10292 nanops.nankurt,
10293 )
10294 cls.kurtosis = cls.kurt
10295 cls.prod = _make_min_count_stat_function(
10296 cls,
10297 "prod",
10298 name,
10299 name2,
10300 axis_descr,
10301 "Return the product of the values for the requested axis.",
10302 nanops.nanprod,
10303 examples=_prod_examples,
10304 )
10305 cls.product = cls.prod
10306 cls.median = _make_stat_function(
10307 cls,
10308 "median",
10309 name,
10310 name2,
10311 axis_descr,
10312 "Return the median of the values for the requested axis.",
10313 nanops.nanmedian,
10314 )
10315 cls.max = _make_stat_function(
10316 cls,
10317 "max",
10318 name,
10319 name2,
10320 axis_descr,
10321 """Return the maximum of the values for the requested axis.\n
10322 If you want the *index* of the maximum, use ``idxmax``. This is
10323 the equivalent of the ``numpy.ndarray`` method ``argmax``.""",
10324 nanops.nanmax,
10325 _stat_func_see_also,
10326 _max_examples,
10327 )
10328 cls.min = _make_stat_function(
10329 cls,
10330 "min",
10331 name,
10332 name2,
10333 axis_descr,
10334 """Return the minimum of the values for the requested axis.\n
10335 If you want the *index* of the minimum, use ``idxmin``. This is
10336 the equivalent of the ``numpy.ndarray`` method ``argmin``.""",
10337 nanops.nanmin,
10338 _stat_func_see_also,
10339 _min_examples,
10340 )
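# Each reduction attached above follows the same skeleton: resolve the
# axis, optionally aggregate by level, then defer to a nanops kernel
# with the given skipna; e.g. (sketch):
# >>> s = pd.Series([1.0, np.nan, 3.0])
# >>> s.sum(), s.sum(skipna=False)
# (4.0, nan)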
10342 @classmethod
10343 def _add_series_or_dataframe_operations(cls):
10344 """
10345 Add the series or dataframe only operations to the cls; evaluate
10346 the doc strings again.
10347 """
10349 from pandas.core.window import EWM, Expanding, Rolling, Window
10351 @Appender(Rolling.__doc__)
10352 def rolling(
10353 self,
10354 window,
10355 min_periods=None,
10356 center=False,
10357 win_type=None,
10358 on=None,
10359 axis=0,
10360 closed=None,
10361 ):
10362 axis = self._get_axis_number(axis)
10364 if win_type is not None:
10365 return Window(
10366 self,
10367 window=window,
10368 min_periods=min_periods,
10369 center=center,
10370 win_type=win_type,
10371 on=on,
10372 axis=axis,
10373 closed=closed,
10374 )
10376 return Rolling(
10377 self,
10378 window=window,
10379 min_periods=min_periods,
10380 center=center,
10381 win_type=win_type,
10382 on=on,
10383 axis=axis,
10384 closed=closed,
10385 )
10387 cls.rolling = rolling
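# rolling() returns a Window when win_type is given and a Rolling
# otherwise; minimal usage (sketch):
# >>> pd.Series([1, 2, 3, 4]).rolling(window=2).sum()
# 0    NaN
# 1    3.0
# 2    5.0
# 3    7.0
# dtype: float64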
10389 @Appender(Expanding.__doc__)
10390 def expanding(self, min_periods=1, center=False, axis=0):
10391 axis = self._get_axis_number(axis)
10392 return Expanding(self, min_periods=min_periods, center=center, axis=axis)
10394 cls.expanding = expanding
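# expanding() is the cumulative counterpart of rolling() (sketch):
# >>> pd.Series([1, 2, 3]).expanding(min_periods=1).sum()
# 0    1.0
# 1    3.0
# 2    6.0
# dtype: float64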
10396 @Appender(EWM.__doc__)
10397 def ewm(
10398 self,
10399 com=None,
10400 span=None,
10401 halflife=None,
10402 alpha=None,
10403 min_periods=0,
10404 adjust=True,
10405 ignore_na=False,
10406 axis=0,
10407 ):
10408 axis = self._get_axis_number(axis)
10409 return EWM(
10410 self,
10411 com=com,
10412 span=span,
10413 halflife=halflife,
10414 alpha=alpha,
10415 min_periods=min_periods,
10416 adjust=adjust,
10417 ignore_na=ignore_na,
10418 axis=axis,
10419 )
10421 cls.ewm = ewm
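# ewm() requires exactly one of com, span, halflife or alpha; span=2
# corresponds to alpha = 2 / (span + 1) = 2/3 (sketch; formatting
# approximate):
# >>> pd.Series([1.0, 2.0, 3.0]).ewm(span=2).mean()
# 0    1.000000
# 1    1.750000
# 2    2.615385
# dtype: float64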
10423 @Appender(_shared_docs["transform"] % dict(axis="", **_shared_doc_kwargs))
10424 def transform(self, func, *args, **kwargs):
10425 result = self.agg(func, *args, **kwargs)
10426 if is_scalar(result) or len(result) != len(self):
10427 raise ValueError("transforms cannot produce aggregated results")
10429 return result
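# transform() delegates to agg() and merely rejects results whose length
# changed, so elementwise callables pass while reductions raise (sketch):
# >>> pd.Series([1, 2, 3]).transform(lambda x: x + 1)
# 0    2
# 1    3
# 2    4
# dtype: int64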
10431 # ----------------------------------------------------------------------
10432 # Misc methods
10434 _shared_docs[
10435 "valid_index"
10436 ] = """
10437 Return index for %(position)s non-NA/null value.
10439 Returns
10440 -------
10441 scalar : type of index
10443 Notes
10444 -----
10445 If all elements are NA/null, returns None.
10446 Also returns None for empty %(klass)s.
10447 """
10449 def _find_valid_index(self, how: str):
10450 """
10451 Retrieves the index of the first or last valid value, depending on ``how``.
10453 Parameters
10454 ----------
10455 how : {'first', 'last'}
10456 Use this parameter to change between the first or last valid index.
10458 Returns
10459 -------
10460 idx_first_valid : type of index
10461 """
10463 idxpos = find_valid_index(self._values, how)
10464 if idxpos is None:
10465 return None
10466 return self.index[idxpos]
10468 @Appender(
10469 _shared_docs["valid_index"] % {"position": "first", "klass": "Series/DataFrame"}
10470 )
10471 def first_valid_index(self):
10472 return self._find_valid_index("first")
10474 @Appender(
10475 _shared_docs["valid_index"] % {"position": "last", "klass": "Series/DataFrame"}
10476 )
10477 def last_valid_index(self):
10478 return self._find_valid_index("last")
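# first_valid_index / last_valid_index in action (sketch):
# >>> s = pd.Series([np.nan, 2.0, np.nan, 4.0])
# >>> s.first_valid_index(), s.last_valid_index()
# (1, 3)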
10481def _doc_parms(cls):
10482 """Return a tuple of the doc parms."""
10483 axis_descr = (
10484 f"{{{', '.join(f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS))}}}"
10485 )
10486 name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar"
10487 name2 = cls.__name__
10488 return axis_descr, name, name2
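# For DataFrame this yields ('{index (0), columns (1)}', 'Series',
# 'DataFrame'); for Series, _AXIS_LEN is 1 and name falls back to
# 'scalar' (sketch):
# >>> _doc_parms(pd.DataFrame)[1:]
# ('Series', 'DataFrame')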
10491_num_doc = """
10492%(desc)s
10494Parameters
10495----------
10496axis : %(axis_descr)s
10497 Axis for the function to be applied on.
10498skipna : bool, default True
10499 Exclude NA/null values when computing the result.
10500level : int or level name, default None
10501 If the axis is a MultiIndex (hierarchical), count along a
10502 particular level, collapsing into a %(name1)s.
10503numeric_only : bool, default None
10504 Include only float, int, boolean columns. If None, will attempt to use
10505 everything, then use only numeric data. Not implemented for Series.
10506%(min_count)s\
10507**kwargs
10508 Additional keyword arguments to be passed to the function.
10510Returns
10511-------
10512%(name1)s or %(name2)s (if level specified)\
10513%(see_also)s\
10514%(examples)s
10515"""
10517_num_ddof_doc = """
10518%(desc)s
10520Parameters
10521----------
10522axis : %(axis_descr)s
10523skipna : bool, default True
10524 Exclude NA/null values. If an entire row/column is NA, the result
10525 will be NA.
10526level : int or level name, default None
10527 If the axis is a MultiIndex (hierarchical), count along a
10528 particular level, collapsing into a %(name1)s.
10529ddof : int, default 1
10530 Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
10531 where N represents the number of elements.
10532numeric_only : bool, default None
10533 Include only float, int, boolean columns. If None, will attempt to use
10534 everything, then use only numeric data. Not implemented for Series.
10536Returns
10537-------
10538%(name1)s or %(name2)s (if level specified)\n"""
10540_bool_doc = """
10541%(desc)s
10543Parameters
10544----------
10545axis : {0 or 'index', 1 or 'columns', None}, default 0
10546 Indicate which axis or axes should be reduced.
10548 * 0 / 'index' : reduce the index, return a Series whose index is the
10549 original column labels.
10550 * 1 / 'columns' : reduce the columns, return a Series whose index is the
10551 original index.
10552 * None : reduce all axes, return a scalar.
10554bool_only : bool, default None
10555 Include only boolean columns. If None, will attempt to use everything,
10556 then use only boolean data. Not implemented for Series.
10557skipna : bool, default True
10558 Exclude NA/null values. If the entire row/column is NA and skipna is
10559 True, then the result will be %(empty_value)s, as for an empty row/column.
10560 If skipna is False, then NA are treated as True, because these are not
10561 equal to zero.
10562level : int or level name, default None
10563 If the axis is a MultiIndex (hierarchical), count along a
10564 particular level, collapsing into a %(name1)s.
10565**kwargs : any, default None
10566 Additional keywords have no effect but might be accepted for
10567 compatibility with NumPy.
10569Returns
10570-------
10571%(name1)s or %(name2)s
10572 If level is specified, then %(name2)s is returned; otherwise, %(name1)s
10573 is returned.
10575%(see_also)s
10576%(examples)s"""
10578_all_desc = """\
10579Return whether all elements are True, potentially over an axis.
10581 Returns True unless there is at least one element within a Series or
10582 along a DataFrame axis that is False or equivalent (e.g. zero or
10583 empty)."""
10585_all_examples = """\
10586Examples
10587--------
10588**Series**
10590>>> pd.Series([True, True]).all()
10591True
10592>>> pd.Series([True, False]).all()
10593False
10594>>> pd.Series([]).all()
10595True
10596>>> pd.Series([np.nan]).all()
10597True
10598>>> pd.Series([np.nan]).all(skipna=False)
10599True
10601**DataFrames**
10603Create a dataframe from a dictionary.
10605>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
10606>>> df
10607 col1 col2
106080 True True
106091 True False
10611Default behaviour checks if column-wise values all return True.
10613>>> df.all()
10614col1 True
10615col2 False
10616dtype: bool
10618Specify ``axis='columns'`` to check if row-wise values all return True.
10620>>> df.all(axis='columns')
106210 True
106221 False
10623dtype: bool
10625Or ``axis=None`` for whether every value is True.
10627>>> df.all(axis=None)
10628False
10629"""
10631_all_see_also = """\
10632See Also
10633--------
10634Series.all : Return True if all elements are True.
10635DataFrame.any : Return True if one (or more) elements are True.
10636"""
10638_cnum_doc = """
10639Return cumulative %(desc)s over a DataFrame or Series axis.
10641Returns a DataFrame or Series of the same size containing the cumulative
10642%(desc)s.
10644Parameters
10645----------
10646axis : {0 or 'index', 1 or 'columns'}, default 0
10647 The index or the name of the axis. 0 is equivalent to None or 'index'.
10648skipna : bool, default True
10649 Exclude NA/null values. If an entire row/column is NA, the result
10650 will be NA.
10651*args, **kwargs :
10652 Additional keywords have no effect but might be accepted for
10653 compatibility with NumPy.
10655Returns
10656-------
10657%(name1)s or %(name2)s
10659See Also
10660--------
10661core.window.Expanding.%(accum_func_name)s : Similar functionality
10662 but ignores ``NaN`` values.
10663%(name2)s.%(accum_func_name)s : Return the %(desc)s over
10664 %(name2)s axis.
10665%(name2)s.cummax : Return cumulative maximum over %(name2)s axis.
10666%(name2)s.cummin : Return cumulative minimum over %(name2)s axis.
10667%(name2)s.cumsum : Return cumulative sum over %(name2)s axis.
10668%(name2)s.cumprod : Return cumulative product over %(name2)s axis.
10670%(examples)s"""
10672_cummin_examples = """\
10673Examples
10674--------
10675**Series**
10677>>> s = pd.Series([2, np.nan, 5, -1, 0])
10678>>> s
106790 2.0
106801 NaN
106812 5.0
106823 -1.0
106834 0.0
10684dtype: float64
10686By default, NA values are ignored.
10688>>> s.cummin()
106890 2.0
106901 NaN
106912 2.0
106923 -1.0
106934 -1.0
10694dtype: float64
10696To include NA values in the operation, use ``skipna=False``
10698>>> s.cummin(skipna=False)
106990 2.0
107001 NaN
107012 NaN
107023 NaN
107034 NaN
10704dtype: float64
10706**DataFrame**
10708>>> df = pd.DataFrame([[2.0, 1.0],
10709... [3.0, np.nan],
10710... [1.0, 0.0]],
10711... columns=list('AB'))
10712>>> df
10713 A B
107140 2.0 1.0
107151 3.0 NaN
107162 1.0 0.0
10718By default, iterates over rows and finds the minimum
10719in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
10721>>> df.cummin()
10722 A B
107230 2.0 1.0
107241 2.0 NaN
107252 1.0 0.0
10727To iterate over columns and find the minimum in each row,
10728use ``axis=1``
10730>>> df.cummin(axis=1)
10731 A B
107320 2.0 1.0
107331 3.0 NaN
107342 1.0 0.0
10735"""
10737_cumsum_examples = """\
10738Examples
10739--------
10740**Series**
10742>>> s = pd.Series([2, np.nan, 5, -1, 0])
10743>>> s
107440 2.0
107451 NaN
107462 5.0
107473 -1.0
107484 0.0
10749dtype: float64
10751By default, NA values are ignored.
10753>>> s.cumsum()
107540 2.0
107551 NaN
107562 7.0
107573 6.0
107584 6.0
10759dtype: float64
10761To include NA values in the operation, use ``skipna=False``
10763>>> s.cumsum(skipna=False)
107640 2.0
107651 NaN
107662 NaN
107673 NaN
107684 NaN
10769dtype: float64
10771**DataFrame**
10773>>> df = pd.DataFrame([[2.0, 1.0],
10774... [3.0, np.nan],
10775... [1.0, 0.0]],
10776... columns=list('AB'))
10777>>> df
10778 A B
107790 2.0 1.0
107801 3.0 NaN
107812 1.0 0.0
10783By default, iterates over rows and finds the sum
10784in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
10786>>> df.cumsum()
10787 A B
107880 2.0 1.0
107891 5.0 NaN
107902 6.0 1.0
10792To iterate over columns and find the sum in each row,
10793use ``axis=1``
10795>>> df.cumsum(axis=1)
10796 A B
107970 2.0 3.0
107981 3.0 NaN
107992 1.0 1.0
10800"""
10802_cumprod_examples = """\
10803Examples
10804--------
10805**Series**
10807>>> s = pd.Series([2, np.nan, 5, -1, 0])
10808>>> s
108090 2.0
108101 NaN
108112 5.0
108123 -1.0
108134 0.0
10814dtype: float64
10816By default, NA values are ignored.
10818>>> s.cumprod()
108190 2.0
108201 NaN
108212 10.0
108223 -10.0
108234 -0.0
10824dtype: float64
10826To include NA values in the operation, use ``skipna=False``
10828>>> s.cumprod(skipna=False)
108290 2.0
108301 NaN
108312 NaN
108323 NaN
108334 NaN
10834dtype: float64
10836**DataFrame**
10838>>> df = pd.DataFrame([[2.0, 1.0],
10839... [3.0, np.nan],
10840... [1.0, 0.0]],
10841... columns=list('AB'))
10842>>> df
10843 A B
108440 2.0 1.0
108451 3.0 NaN
108462 1.0 0.0
10848By default, iterates over rows and finds the product
10849in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
10851>>> df.cumprod()
10852 A B
108530 2.0 1.0
108541 6.0 NaN
108552 6.0 0.0
10857To iterate over columns and find the product in each row,
10858use ``axis=1``
10860>>> df.cumprod(axis=1)
10861 A B
108620 2.0 2.0
108631 3.0 NaN
108642 1.0 0.0
10865"""
10867_cummax_examples = """\
10868Examples
10869--------
10870**Series**
10872>>> s = pd.Series([2, np.nan, 5, -1, 0])
10873>>> s
108740 2.0
108751 NaN
108762 5.0
108773 -1.0
108784 0.0
10879dtype: float64
10881By default, NA values are ignored.
10883>>> s.cummax()
108840 2.0
108851 NaN
108862 5.0
108873 5.0
108884 5.0
10889dtype: float64
10891To include NA values in the operation, use ``skipna=False``
10893>>> s.cummax(skipna=False)
108940 2.0
108951 NaN
108962 NaN
108973 NaN
108984 NaN
10899dtype: float64
10901**DataFrame**
10903>>> df = pd.DataFrame([[2.0, 1.0],
10904... [3.0, np.nan],
10905... [1.0, 0.0]],
10906... columns=list('AB'))
10907>>> df
10908 A B
109090 2.0 1.0
109101 3.0 NaN
109112 1.0 0.0
10913By default, iterates over rows and finds the maximum
10914in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
10916>>> df.cummax()
10917 A B
109180 2.0 1.0
109191 3.0 NaN
109202 3.0 1.0
10922To iterate over columns and find the maximum in each row,
10923use ``axis=1``
10925>>> df.cummax(axis=1)
10926 A B
109270 2.0 2.0
109281 3.0 NaN
109292 1.0 1.0
10930"""
10932_any_see_also = """\
10933See Also
10934--------
10935numpy.any : Numpy version of this method.
10936Series.any : Return whether any element is True.
10937Series.all : Return whether all elements are True.
10938DataFrame.any : Return whether any element is True over requested axis.
10939DataFrame.all : Return whether all elements are True over requested axis.
10940"""
10942_any_desc = """\
10943Return whether any element is True, potentially over an axis.
10945 Returns False unless there is at least one element within a Series or
10946 along a DataFrame axis that is True or equivalent (e.g. non-zero or
10947 non-empty)."""
10949_any_examples = """\
10950Examples
10951--------
10952**Series**
10954For Series input, the output is a scalar indicating whether any element
10955is True.
10957>>> pd.Series([False, False]).any()
10958False
10959>>> pd.Series([True, False]).any()
10960True
10961>>> pd.Series([]).any()
10962False
10963>>> pd.Series([np.nan]).any()
10964False
10965>>> pd.Series([np.nan]).any(skipna=False)
10966True
10968**DataFrame**
10970Whether each column contains at least one True element (the default).
10972>>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
10973>>> df
10974 A B C
109750 1 0 0
109761 2 2 0
10978>>> df.any()
10979A True
10980B True
10981C False
10982dtype: bool
10984Aggregating over the columns.
10986>>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
10987>>> df
10988 A B
109890 True 1
109901 False 2
10992>>> df.any(axis='columns')
109930 True
109941 True
10995dtype: bool
10997>>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
10998>>> df
10999 A B
110000 True 1
110011 False 0
11003>>> df.any(axis='columns')
110040 True
110051 False
11006dtype: bool
11008Aggregating over the entire DataFrame with ``axis=None``.
11010>>> df.any(axis=None)
11011True
11013`any` for an empty DataFrame is an empty Series.
11015>>> pd.DataFrame([]).any()
11016Series([], dtype: bool)
11017"""
11019_shared_docs[
11020 "stat_func_example"
11021] = """
11023Examples
11024--------
11025>>> idx = pd.MultiIndex.from_arrays([
11026... ['warm', 'warm', 'cold', 'cold'],
11027... ['dog', 'falcon', 'fish', 'spider']],
11028... names=['blooded', 'animal'])
11029>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
11030>>> s
11031blooded animal
11032warm dog 4
11033 falcon 2
11034cold fish 0
11035 spider 8
11036Name: legs, dtype: int64
11038>>> s.{stat_func}()
11039{default_output}
11041{verb} using level names, as well as indices.
11043>>> s.{stat_func}(level='blooded')
11044blooded
11045warm {level_output_0}
11046cold {level_output_1}
11047Name: legs, dtype: int64
11049>>> s.{stat_func}(level=0)
11050blooded
11051warm {level_output_0}
11052cold {level_output_1}
11053Name: legs, dtype: int64"""
11055_sum_examples = _shared_docs["stat_func_example"].format(
11056 stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8
11059_sum_examples += """
11061By default, the sum of an empty or all-NA Series is ``0``.
11063>>> pd.Series([]).sum() # min_count=0 is the default
110640.0
11066This can be controlled with the ``min_count`` parameter. For example, if
11067you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
11069>>> pd.Series([]).sum(min_count=1)
11070nan
11072Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
11073empty series identically.
11075>>> pd.Series([np.nan]).sum()
110760.0
11078>>> pd.Series([np.nan]).sum(min_count=1)
11079nan"""
11081_max_examples = _shared_docs["stat_func_example"].format(
11082 stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8
11085_min_examples = _shared_docs["stat_func_example"].format(
11086 stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0
11089_stat_func_see_also = """
11091See Also
11092--------
11093Series.sum : Return the sum.
11094Series.min : Return the minimum.
11095Series.max : Return the maximum.
11096Series.idxmin : Return the index of the minimum.
11097Series.idxmax : Return the index of the maximum.
11098DataFrame.sum : Return the sum over the requested axis.
11099DataFrame.min : Return the minimum over the requested axis.
11100DataFrame.max : Return the maximum over the requested axis.
11101DataFrame.idxmin : Return the index of the minimum over the requested axis.
11102DataFrame.idxmax : Return the index of the maximum over the requested axis."""
11104_prod_examples = """
11106Examples
11107--------
11108 By default, the product of an empty or all-NA Series is ``1``.
11110>>> pd.Series([]).prod()
111111.0
11113 This can be controlled with the ``min_count`` parameter.
11115>>> pd.Series([]).prod(min_count=1)
11116nan
11118Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
11119empty series identically.
11121>>> pd.Series([np.nan]).prod()
111221.0
11124>>> pd.Series([np.nan]).prod(min_count=1)
11125nan"""
11127_min_count_stub = """\
11128min_count : int, default 0
11129 The required number of valid values to perform the operation. If fewer than
11130 ``min_count`` non-NA values are present the result will be NA.
11132 .. versionadded:: 0.22.0
11134 Added with the default being 0. This means the sum of an all-NA
11135 or empty Series is 0, and the product of an all-NA or empty
11136 Series is 1.
11137"""
11140def _make_min_count_stat_function(
11141 cls, name, name1, name2, axis_descr, desc, f, see_also: str = "", examples: str = ""
11142):
11143 @Substitution(
11144 desc=desc,
11145 name1=name1,
11146 name2=name2,
11147 axis_descr=axis_descr,
11148 min_count=_min_count_stub,
11149 see_also=see_also,
11150 examples=examples,
11151 )
11152 @Appender(_num_doc)
11153 def stat_func(
11154 self,
11155 axis=None,
11156 skipna=None,
11157 level=None,
11158 numeric_only=None,
11159 min_count=0,
11160 **kwargs,
11161 ):
11162 if name == "sum":
11163 nv.validate_sum(tuple(), kwargs)
11164 elif name == "prod":
11165 nv.validate_prod(tuple(), kwargs)
11166 else:
11167 nv.validate_stat_func(tuple(), kwargs, fname=name)
11168 if skipna is None:
11169 skipna = True
11170 if axis is None:
11171 axis = self._stat_axis_number
11172 if level is not None:
11173 return self._agg_by_level(
11174 name, axis=axis, level=level, skipna=skipna, min_count=min_count
11175 )
11176 return self._reduce(
11177 f,
11178 name,
11179 axis=axis,
11180 skipna=skipna,
11181 numeric_only=numeric_only,
11182 min_count=min_count,
11183 )
11185 return set_function_name(stat_func, name, cls)
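# The min_count plumbing above is what makes empty/all-NA sums return 0
# by default but NA once a threshold is requested (sketch):
# >>> pd.Series([np.nan]).sum(), pd.Series([np.nan]).sum(min_count=1)
# (0.0, nan)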
11188def _make_stat_function(
11189 cls, name, name1, name2, axis_descr, desc, f, see_also: str = "", examples: str = ""
11190):
11191 @Substitution(
11192 desc=desc,
11193 name1=name1,
11194 name2=name2,
11195 axis_descr=axis_descr,
11196 min_count="",
11197 see_also=see_also,
11198 examples=examples,
11199 )
11200 @Appender(_num_doc)
11201 def stat_func(
11202 self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
11203 ):
11204 if name == "median":
11205 nv.validate_median(tuple(), kwargs)
11206 else:
11207 nv.validate_stat_func(tuple(), kwargs, fname=name)
11208 if skipna is None:
11209 skipna = True
11210 if axis is None:
11211 axis = self._stat_axis_number
11212 if level is not None:
11213 return self._agg_by_level(name, axis=axis, level=level, skipna=skipna)
11214 return self._reduce(
11215 f, name, axis=axis, skipna=skipna, numeric_only=numeric_only
11216 )
11218 return set_function_name(stat_func, name, cls)
11221def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f):
11222 @Substitution(desc=desc, name1=name1, name2=name2, axis_descr=axis_descr)
11223 @Appender(_num_ddof_doc)
11224 def stat_func(
11225 self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
11226 ):
11227 nv.validate_stat_ddof_func(tuple(), kwargs, fname=name)
11228 if skipna is None:
11229 skipna = True
11230 if axis is None:
11231 axis = self._stat_axis_number
11232 if level is not None:
11233 return self._agg_by_level(
11234 name, axis=axis, level=level, skipna=skipna, ddof=ddof
11235 )
11236 return self._reduce(
11237 f, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof
11238 )
11240 return set_function_name(stat_func, name, cls)
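# ddof flows straight through to the nanops kernel; std(ddof=0) is the
# population standard deviation (sketch):
# >>> s = pd.Series([1.0, 2.0, 3.0])
# >>> s.std(ddof=0), s.std()
# (0.816496580927726, 1.0)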
11243def _make_cum_function(
11244 cls,
11245 name,
11246 name1,
11247 name2,
11248 axis_descr,
11249 desc,
11250 accum_func,
11251 accum_func_name,
11252 mask_a,
11253 mask_b,
11254 examples,
11255):
11256 @Substitution(
11257 desc=desc,
11258 name1=name1,
11259 name2=name2,
11260 axis_descr=axis_descr,
11261 accum_func_name=accum_func_name,
11262 examples=examples,
11263 )
11264 @Appender(_cnum_doc)
11265 def cum_func(self, axis=None, skipna=True, *args, **kwargs):
11266 skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name)
11267 if axis is None:
11268 axis = self._stat_axis_number
11269 else:
11270 axis = self._get_axis_number(axis)
11272 if axis == 1:
11273 return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T
11275 def na_accum_func(blk_values):
11276 # We will be applying this function to block values
11277 if blk_values.dtype.kind in ["m", "M"]:
11278 # GH#30460, GH#29058
11279 # numpy 1.18 started sorting NaTs at the end instead of beginning,
11280 # so we need to work around to maintain backwards-consistency.
11281 orig_dtype = blk_values.dtype
11283 # We need to define mask before masking NaTs
11284 mask = isna(blk_values)
11286 if accum_func == np.minimum.accumulate:
11287 # Note: the accum_func comparison fails as an "is" comparison
11288 y = blk_values.view("i8")
11289 y[mask] = np.iinfo(np.int64).max
11290 changed = True
11291 else:
11292 y = blk_values
11293 changed = False
11295 result = accum_func(y.view("i8"), axis)
11296 if skipna:
11297 np.putmask(result, mask, iNaT)
11298 elif accum_func == np.minimum.accumulate:
11299 # Restore NaTs that we masked previously
11300 nz = (~np.asarray(mask)).nonzero()[0]
11301 if len(nz):
11302 # everything up to the first non-na entry stays NaT
11303 result[: nz[0]] = iNaT
11305 if changed:
11306 # restore NaT elements
11307 y[mask] = iNaT # TODO: could try/finally for this?
11309 if isinstance(blk_values, np.ndarray):
11310 result = result.view(orig_dtype)
11311 else:
11312 # DatetimeArray
11313 result = type(blk_values)._from_sequence(result, dtype=orig_dtype)
11315 elif skipna and not issubclass(
11316 blk_values.dtype.type, (np.integer, np.bool_)
11317 ):
11318 vals = blk_values.copy().T
11319 mask = isna(vals)
11320 np.putmask(vals, mask, mask_a)
11321 result = accum_func(vals, axis)
11322 np.putmask(result, mask, mask_b)
11323 else:
11324 result = accum_func(blk_values.T, axis)
11326 # transpose back for ndarray, not for EA
11327 return result.T if hasattr(result, "T") else result
11329 result = self._data.apply(na_accum_func)
11331 d = self._construct_axes_dict()
11332 d["copy"] = False
11333 return self._constructor(result, **d).__finalize__(self)
11335 return set_function_name(cum_func, name, cls)
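# The NaT masking in na_accum_func keeps datetime cummin/cummax stable
# across numpy versions; observable behaviour (sketch; formatting
# approximate):
# >>> s = pd.Series(pd.to_datetime(['2000-01-01', 'NaT', '1999-01-01']))
# >>> s.cummin()
# 0   2000-01-01
# 1          NaT
# 2   1999-01-01
# dtype: datetime64[ns]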
11338def _make_logical_function(
11339 cls, name, name1, name2, axis_descr, desc, f, see_also, examples, empty_value
11340):
11341 @Substitution(
11342 desc=desc,
11343 name1=name1,
11344 name2=name2,
11345 axis_descr=axis_descr,
11346 see_also=see_also,
11347 examples=examples,
11348 empty_value=empty_value,
11349 )
11350 @Appender(_bool_doc)
11351 def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
11352 nv.validate_logical_func(tuple(), kwargs, fname=name)
11353 if level is not None:
11354 if bool_only is not None:
11355 raise NotImplementedError(
11356 "Option bool_only is not implemented with option level."
11357 )
11358 return self._agg_by_level(name, axis=axis, level=level, skipna=skipna)
11359 return self._reduce(
11360 f,
11361 name,
11362 axis=axis,
11363 skipna=skipna,
11364 numeric_only=bool_only,
11365 filter_type="bool",
11366 )
11368 return set_function_name(logical_func, name, cls)
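# The logical reductions built here reuse the stat-function skeleton but
# reduce with filter_type="bool" (sketch):
# >>> df = pd.DataFrame({'a': [True, False], 'b': [0, 0]})
# >>> df.any()
# a     True
# b    False
# dtype: bool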