Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/construction.py : 12%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Constructor functions intended to be shared by pd.array, Series.__init__,
3and Index.__new__.
5These should not depend on core.internals.
6"""
7from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast
9import numpy as np
10import numpy.ma as ma
12from pandas._libs import lib
13from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime
14from pandas._typing import ArrayLike, Dtype
16from pandas.core.dtypes.cast import (
17 construct_1d_arraylike_from_scalar,
18 construct_1d_ndarray_preserving_na,
19 construct_1d_object_array_from_listlike,
20 infer_dtype_from_scalar,
21 maybe_cast_to_datetime,
22 maybe_cast_to_integer_array,
23 maybe_castable,
24 maybe_convert_platform,
25 maybe_upcast,
26)
27from pandas.core.dtypes.common import (
28 is_categorical_dtype,
29 is_datetime64_ns_dtype,
30 is_extension_array_dtype,
31 is_float_dtype,
32 is_integer_dtype,
33 is_iterator,
34 is_list_like,
35 is_object_dtype,
36 is_timedelta64_ns_dtype,
37 pandas_dtype,
38)
39from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype, registry
40from pandas.core.dtypes.generic import (
41 ABCExtensionArray,
42 ABCIndexClass,
43 ABCPandasArray,
44 ABCSeries,
45)
46from pandas.core.dtypes.missing import isna
48import pandas.core.common as com
50if TYPE_CHECKING:
51 from pandas.core.series import Series # noqa: F401
52 from pandas.core.indexes.api import Index # noqa: F401
def array(
    data: Sequence[object],
    dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None,
    copy: bool = True,
) -> ABCExtensionArray:
    """
    Build a pandas array from a one-dimensional sequence.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    data : Sequence of objects
        One-dimensional data whose scalars should be instances of the
        scalar type belonging to `dtype`.

        An Index or Series is unwrapped first, so only its underlying
        array is used.

    dtype : str, np.dtype, or ExtensionDtype, optional
        Requested dtype of the result: either a NumPy dtype or an
        extension type registered with pandas through
        :meth:`pandas.api.extensions.register_extension_dtype`.

        When omitted, the dtype is determined as follows:

        1. If `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, its own `dtype` is used.
        2. In every other case pandas tries to infer the `dtype`
           from the values.

        For a NumPy array input, ``data.dtype`` is deliberately *not*
        consulted during inference, because NumPy cannot represent all
        the types of data that extension arrays can hold.

        Sequences of the following scalars are currently inferred as an
        extension dtype:

        ============================== =====================================
        Scalar Type                    Array Type
        ============================== =====================================
        :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
        :class:`int`                   :class:`pandas.arrays.IntegerArray`
        :class:`str`                   :class:`pandas.arrays.StringArray`
        :class:`bool`                  :class:`pandas.arrays.BooleanArray`
        ============================== =====================================

        Everything else follows NumPy's usual inference rules.

        .. versionchanged:: 1.0.0

           Integer data is inferred as nullable-integer dtype, string
           data as string dtype, and boolean data as nullable-boolean
           dtype.

    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending on
        the type of `data`, building the new array may still copy the
        data when ``copy=False``.

    Returns
    -------
    ExtensionArray
        The newly created array.

    Raises
    ------
    ValueError
        When `data` is not 1-dimensional.

    See Also
    --------
    numpy.array : Construct a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.
    arrays.PandasArray : ExtensionArray wrapping a NumPy array.
    Series.array : Extract the array stored within a Series.

    Notes
    -----
    Leaving out `dtype` asks pandas to pick the best array type from the
    values. What counts as "best" may change as pandas and 3rd party
    libraries add new array types. Passing `dtype` explicitly ensures that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    If the underlying memory representation of the result matters as
    well, pass the `dtype` as a concrete object instead of a string alias
    or relying on inference. For example, a future version of pandas or a
    3rd-party library may ship a dedicated ExtensionArray for string
    data, in which case the following would no longer return a
    :class:`arrays.PandasArray` backed by a NumPy array.

    >>> pd.array(['a', 'b'], dtype=str)
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    It would return the new string ExtensionArray instead. To insist on
    a NumPy-backed result, say so in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Finally, Pandas has arrays that mostly overlap with NumPy

      * :class:`arrays.DatetimeArray`
      * :class:`arrays.TimedeltaArray`

    Data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype always
    yields a ``DatetimeArray`` or ``TimedeltaArray`` rather than a
    ``PandasArray``. This mirrors the timezone-aware case, which NumPy
    does not natively support.

    >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
    <DatetimeArray>
    ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
    Length: 2, dtype: datetime64[ns]

    >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]')
    <TimedeltaArray>
    ['01:00:00', '02:00:00']
    Length: 2, dtype: timedelta64[ns]

    Examples
    --------
    Without a dtype, pandas infers the best dtype from the values.
    The description of `dtype` lists the types pandas infers for.

    >>> pd.array([1, 2])
    <IntegerArray>
    [1, 2]
    Length: 2, dtype: Int64

    >>> pd.array([1, 2, np.nan])
    <IntegerArray>
    [1, 2, NaN]
    Length: 3, dtype: Int64

    >>> pd.array(["a", None, "c"])
    <StringArray>
    ['a', nan, 'c']
    Length: 3, dtype: string

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    <PeriodArray>
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    The string alias of a `dtype` is accepted

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    [a, b, a]
    Categories (2, object): [a, b]

    as is the actual dtype object

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    [a, b, a]
    Categories (3, object): [a < b < c]

    When no dedicated extension type is inferred, a
    :class:`arrays.PandasArray` is returned.

    >>> pd.array([1.1, 2.2])
    <PandasArray>
    [1.1, 2.2]
    Length: 2, dtype: float64

    As the "Notes" section explains, extension types added later (by
    pandas or 3rd party libraries) may cause the return value to stop
    being a :class:`arrays.PandasArray`. Pass the `dtype` as a NumPy
    dtype to rule out such a change in behavior.

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int32

    `data` must be 1-dimensional. Input of the wrong dimensionality
    raises a ValueError.

    >>> pd.array(1)
    Traceback (most recent call last):
      ...
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    """
    from pandas.core.arrays import (
        BooleanArray,
        DatetimeArray,
        IntegerArray,
        IntervalArray,
        PandasArray,
        StringArray,
        TimedeltaArray,
        period_array,
    )

    if lib.is_scalar(data):
        raise ValueError(f"Cannot pass scalar '{data}' to 'pandas.array'.")

    # Pandas containers carry their own dtype; honour it unless overridden.
    dtyped_containers = (ABCSeries, ABCIndexClass, ABCExtensionArray)
    if dtype is None and isinstance(data, dtyped_containers):
        dtype = data.dtype

    data = extract_array(data, extract_numpy=True)

    if isinstance(dtype, str):
        # registry.find returns None for aliases it does not know, in which
        # case the original string is kept.
        dtype = registry.find(dtype) or dtype

    if is_extension_array_dtype(dtype):
        array_cls = cast(ExtensionDtype, dtype).construct_array_type()
        return array_cls._from_sequence(data, dtype=dtype, copy=copy)

    if dtype is None:
        kind = lib.infer_dtype(data, skipna=True)

        # Inferred kinds that are built directly, without any fallback.
        direct_builders = {
            "string": StringArray,
            "integer": IntegerArray,
            "boolean": BooleanArray,
        }
        if kind in direct_builders:
            return direct_builders[kind]._from_sequence(data, copy=copy)

        if kind.startswith("timedelta"):
            # timedelta, timedelta64
            return TimedeltaArray._from_sequence(data, copy=copy)

        # The remaining kinds may fail on heterogeneous data; in that case
        # we fall through to the PandasArray at the bottom instead of raising.
        if kind == "period":
            try:
                return period_array(data, copy=copy)
            except IncompatibleFrequency:
                # possibly a mixture of frequencies
                pass
        elif kind == "interval":
            try:
                return IntervalArray(data, copy=copy)
            except ValueError:
                # possibly a mixture of `closed`
                pass
        elif kind.startswith("datetime"):
            # datetime, datetime64
            try:
                return DatetimeArray._from_sequence(data, copy=copy)
            except ValueError:
                # mixture of timezones
                pass

    # Pandas overrides NumPy for datetime64[ns] and timedelta64[ns] so that
    # a DatetimeArray / TimedeltaArray is returned.
    if is_datetime64_ns_dtype(dtype):
        return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
    if is_timedelta64_ns_dtype(dtype):
        return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)

    return PandasArray._from_sequence(data, dtype=dtype, copy=copy)
def extract_array(obj, extract_numpy=False):
    """
    Unbox the array held by a Series or Index.

    Any other kind of object is handed back unchanged.

    Parameters
    ----------
    obj : object
        A Series / Index is replaced by its underlying ExtensionArray.
        A NumPy-backed ExtensionArray is further reduced to its ndarray
        when `extract_numpy` is set.

    extract_numpy : bool, default False
        Whether to extract the ndarray from a PandasArray

    Returns
    -------
    arr : object

    Examples
    --------
    >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
    [a, b, c]
    Categories (3, object): [a, b, c]

    Lists, arrays, DataFrames and other objects pass straight through.

    >>> extract_array([1, 2, 3])
    [1, 2, 3]

    An ndarray-backed Series / Index yields a PandasArray.

    >>> extract_array(pd.Series([1, 2, 3]))
    <PandasArray>
    [1, 2, 3]
    Length: 3, dtype: int64

    Pass ``extract_numpy=True`` to go all the way down to the ndarray.

    >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True)
    array([1, 2, 3])
    """
    is_boxed = isinstance(obj, (ABCIndexClass, ABCSeries))
    unboxed = obj.array if is_boxed else obj

    if not extract_numpy:
        return unboxed

    if isinstance(unboxed, ABCPandasArray):
        return unboxed.to_numpy()
    return unboxed
def sanitize_array(
    data, index, dtype=None, copy: bool = False, raise_cast_failure: bool = False
):
    """
    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.

    Parameters
    ----------
    data : object
        The values to sanitize. Masked arrays, ndarrays, ExtensionArrays,
        non-empty lists / tuples and ranges each have a dedicated branch;
        anything else is handed to ``_try_cast`` as is. A Series, Index or
        PandasArray is first reduced via ``extract_array``.
    index : object with a length, or None
        Only ``len(index)`` is used: a scalar (0-dim) result, or a length-1
        result whose length differs from the index, is broadcast to that
        length.
    dtype : optional
        Target dtype; normalised with ``pandas_dtype`` when given.
    copy : bool, default False
        Forwarded to the casting helpers, and used to copy an
        ExtensionArray when no `dtype` is given.
    raise_cast_failure : bool, default False
        Forwarded to ``_try_cast``.

    Returns
    -------
    ndarray or ExtensionArray
        The sanitized values. If the cast result is 0-dimensional, `data`
        is not a list and `index` is None, the result of ``.item()`` on it
        is returned instead.

    Raises
    ------
    ValueError
        If `data` is an ndarray and the result has more than one dimension.
    """
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            # Upcast so that the fill value can be stored, then write it
            # into every masked position.
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    # extract ndarray or ExtensionArray, ensure we have no PandasArray
    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype):
            # possibility of nan -> garbage
            try:
                subarr = _try_cast(data, dtype, copy, True)
            except ValueError:
                # The float -> integer cast failed: keep the float data.
                if copy:
                    subarr = data.copy()
                else:
                    subarr = np.array(data, copy=False)
        else:
            # we will try to copy be-definition here
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        subarr = data

        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()
        # ExtensionArrays skip all of the post-processing below.
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype="int64")
        subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
    else:
        subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, "ndim", 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype
                )

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            # ValueError rather than a bare Exception: it names the actual
            # problem and, being an Exception subclass, is still caught by
            # handlers written against the previous behaviour.
            raise ValueError("Data must be 1-dimensional")
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)):
        # This is to prevent mixed-type Series getting all casted to
        # NumPy string type, e.g. NaN --> '-1#IND'.
        if issubclass(subarr.dtype.type, str):
            # GH#16605
            # If not empty convert the data to dtype
            # GH#19853: If data is a scalar, subarr has already the result
            if not lib.is_scalar(data):
                if not np.all(isna(data)):
                    data = np.array(data, dtype=dtype, copy=False)
                subarr = np.array(data, dtype=object, copy=copy)

        if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype):
            # Object arrays holding only intervals or periods are converted
            # with `array` unless object dtype was explicitly requested.
            inferred = lib.infer_dtype(subarr, skipna=False)
            if inferred in {"interval", "period"}:
                subarr = array(subarr)

    return subarr
def _try_cast(
    arr,
    dtype: Optional[Union[np.dtype, "ExtensionDtype"]],
    copy: bool,
    raise_cast_failure: bool,
):
    """
    Turn the input into a numpy ndarray, casting to `dtype` when one is given.

    Parameters
    ----------
    arr : ndarray, list, tuple, iterator (catchall)
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        If True, and if a dtype is specified, raise errors during casting.
        Otherwise an object array is returned.
    """
    # perf shortcut as this is the most common case
    if (
        isinstance(arr, np.ndarray)
        and maybe_castable(arr)
        and not copy
        and dtype is None
    ):
        return arr

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # The returned array is not used (the result is built from
            # `arr` below); any exception raised here still takes effect.
            maybe_cast_to_integer_array(arr, dtype)

        result = maybe_cast_to_datetime(arr, dtype)

        # Take care in creating object arrays (but iterators are not
        # supported):
        box_as_object = (
            is_object_dtype(dtype)
            and is_list_like(result)
            and not is_iterator(result)
            and not isinstance(result, np.ndarray)
        )
        if box_as_object:
            result = construct_1d_object_array_from_listlike(result)
        elif not is_extension_array_dtype(result):
            result = construct_1d_ndarray_preserving_na(result, dtype, copy=copy)
    except (ValueError, TypeError) as err:
        if isinstance(err, OutOfBoundsDatetime):
            # in case of out of bound datetime64 -> always raise
            raise

        if is_categorical_dtype(dtype):
            # We *do* allow casting to categorical, since we know
            # that Categorical is the only array type for 'category'.
            cat_dtype = cast(CategoricalDtype, dtype)
            categorical_cls = cat_dtype.construct_array_type()
            result = categorical_cls(
                arr, cat_dtype.categories, ordered=cat_dtype.ordered
            )
        elif is_extension_array_dtype(dtype):
            # create an extension array from its dtype
            ext_dtype = cast(ExtensionDtype, dtype)
            from_sequence = ext_dtype.construct_array_type()._from_sequence
            result = from_sequence(arr, dtype=ext_dtype, copy=copy)
        elif dtype is not None and raise_cast_failure:
            raise
        else:
            # Best effort: keep the values in an object array.
            result = np.array(arr, dtype=object, copy=copy)
    return result
def is_empty_data(data: Any) -> bool:
    """
    Tell whether a Series is being instantiated from empty data that
    carries no dtype information.

    Parameters
    ----------
    data : array-like, Iterable, dict, or scalar value
        Contains data stored in Series.

    Returns
    -------
    bool
        True when `data` is None, or when it is list-like, has no
        ``dtype`` attribute and is falsy; False otherwise.
    """
    if data is None:
        return True

    # Anything that is not list-like, or that already exposes a dtype,
    # is never considered "empty without dtype information".
    if not is_list_like(data) or hasattr(data, "dtype"):
        return False

    return not data
def create_series_with_explicit_dtype(
    data: Any = None,
    index: Optional[Union[ArrayLike, "Index"]] = None,
    dtype: Optional[Dtype] = None,
    name: Optional[str] = None,
    copy: bool = False,
    fastpath: bool = False,
    dtype_if_empty: Dtype = object,
) -> "Series":
    """
    Construct a Series, supplying an explicit dtype when the data is empty.

    This silences a DeprecationWarning described in GitHub-17261.

    Parameters
    ----------
    data : Mirrored from Series.__init__
    index : Mirrored from Series.__init__
    dtype : Mirrored from Series.__init__
    name : Mirrored from Series.__init__
    copy : Mirrored from Series.__init__
    fastpath : Mirrored from Series.__init__
    dtype_if_empty : str, numpy.dtype, or ExtensionDtype
        Used as the dtype when `data` is empty (per ``is_empty_data``)
        and no `dtype` was passed.

    Returns
    -------
    Series
    """
    from pandas.core.series import Series

    # Only substitute the fallback when the caller did not choose a dtype.
    use_fallback = is_empty_data(data) and dtype is None
    resolved_dtype = dtype_if_empty if use_fallback else dtype

    return Series(
        data=data,
        index=index,
        dtype=resolved_dtype,
        name=name,
        copy=copy,
        fastpath=fastpath,
    )