Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Quantilization functions and related stuff 

3""" 

4import numpy as np 

5 

6from pandas._libs import Timedelta, Timestamp 

7from pandas._libs.lib import infer_dtype 

8 

9from pandas.core.dtypes.common import ( 

10 _NS_DTYPE, 

11 ensure_int64, 

12 is_bool_dtype, 

13 is_categorical_dtype, 

14 is_datetime64_dtype, 

15 is_datetime64tz_dtype, 

16 is_datetime_or_timedelta_dtype, 

17 is_extension_array_dtype, 

18 is_integer, 

19 is_integer_dtype, 

20 is_list_like, 

21 is_scalar, 

22 is_timedelta64_dtype, 

23) 

24from pandas.core.dtypes.generic import ABCSeries 

25from pandas.core.dtypes.missing import isna 

26 

27from pandas import Categorical, Index, IntervalIndex, to_datetime, to_timedelta 

28import pandas.core.algorithms as algos 

29import pandas.core.nanops as nanops 

30 

31 

def cut(
    x,
    bins,
    right: bool = True,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = "raise",
):
    """
    Bin values into discrete intervals.

    Use `cut` when you need to segment and sort data values into bins. This
    function is also useful for going from a continuous variable to a
    categorical variable. For example, `cut` could convert ages to groups of
    age ranges. Supports binning into an equal number of bins, or a
    pre-specified array of bins.

    Parameters
    ----------
    x : array-like
        The input array to be binned. Must be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        The criteria to bin by.

        * int : Defines the number of equal-width bins in the range of `x`.
          The range of `x` is extended by .1% on the open side of the
          outermost bin (the lower edge when ``right=True``, the upper edge
          otherwise) so that both the minimum and maximum values of `x` are
          included. If all values of `x` are equal, both edges are widened
          instead.
        * sequence of scalars : Defines the bin edges allowing for non-uniform
          width. No extension of the range of `x` is done.
        * IntervalIndex : Defines the exact bins to be used. Note that
          IntervalIndex for `bins` must be non-overlapping.

    right : bool, default True
        Indicates whether `bins` includes the rightmost edge or not. If
        ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
        indicate (1,2], (2,3], (3,4]. This argument is ignored when
        `bins` is an IntervalIndex.
    labels : array or False, default None
        Specifies the labels for the returned bins. Must be the same length as
        the resulting bins. If False, returns only integer indicators of the
        bins. This affects the type of the output container (see below).
        This argument is ignored when `bins` is an IntervalIndex. If True,
        raises an error.
    retbins : bool, default False
        Whether to return the bins or not. Useful when bins is provided
        as a scalar.
    precision : int, default 3
        The precision at which to store and display the bins labels.
    include_lowest : bool, default False
        Whether the first interval should be left-inclusive or not.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.

        .. versionadded:: 0.23.0

    Returns
    -------
    out : Categorical, Series, or ndarray
        An array-like object representing the respective bin for each value
        of `x`. The type depends on the value of `labels`.

        * None (default) : returns a Series for Series `x` or a
          Categorical for all other inputs. The values stored within
          are Interval dtype.

        * sequence of scalars : returns a Series for Series `x` or a
          Categorical for all other inputs. The values stored within
          are whatever the type in the sequence is.

        * False : returns an ndarray of integers.

    bins : numpy.ndarray or IntervalIndex.
        The computed or specified bins. Only returned when `retbins=True`.
        For scalar or sequence `bins`, this is an ndarray with the computed
        bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For
        an IntervalIndex `bins`, this is equal to `bins`.

    Raises
    ------
    ValueError
        If an integer `bins` is smaller than 1, if `x` is empty or contains
        infinity while `bins` is an integer, if an IntervalIndex `bins`
        overlaps, or if a sequence `bins` does not increase monotonically.

    See Also
    --------
    qcut : Discretize variable into equal-sized buckets based on rank
        or based on sample quantiles.
    Categorical : Array type for storing data that come from a
        fixed set of values.
    Series : One-dimensional array with axis labels (including time series).
    IntervalIndex : Immutable Index implementing an ordered, sliceable set.

    Notes
    -----
    Any NA values will be NA in the result. Out of bounds values will be NA in
    the resulting Series or Categorical object.

    Examples
    --------
    Discretize into three equal-sized bins.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
    ... # doctest: +ELLIPSIS
    [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
    ... # doctest: +ELLIPSIS
    ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...
    array([0.994, 3.   , 5.   , 7.   ]))

    Discovers the same bins, but assign them specific labels. Notice that
    the returned Categorical's categories are `labels` and is ordered.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
    ...        3, labels=["bad", "medium", "good"])
    [bad, good, medium, medium, good, bad]
    Categories (3, object): [bad < medium < good]

    ``labels=False`` implies you just want the bins back.

    >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
    array([0, 1, 1, 3])

    Passing a Series as an input returns a Series with categorical dtype:

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, 3)
    ... # doctest: +ELLIPSIS
    a    (1.992, 4.667]
    b    (1.992, 4.667]
    c    (4.667, 7.333]
    d     (7.333, 10.0]
    e     (7.333, 10.0]
    dtype: category
    Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ...

    Passing a Series as an input returns a Series with mapping value.
    It is used to map numerically to intervals based on bins. With
    ``right=False`` the last edge is excluded, so the value 10 falls
    outside every bin and becomes NaN.

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
    ... # doctest: +ELLIPSIS
    (a    1.0
     b    2.0
     c    3.0
     d    4.0
     e    NaN
     dtype: float64,
     array([ 0,  2,  4,  6,  8, 10]))

    Use `drop` optional when bins is not unique

    >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
    ...        right=False, duplicates='drop')
    ... # doctest: +ELLIPSIS
    (a    1.0
     b    2.0
     c    3.0
     d    3.0
     e    NaN
     dtype: float64,
     array([ 0,  2,  4,  6, 10]))

    Passing an IntervalIndex for `bins` results in those categories exactly.
    Notice that values not covered by the IntervalIndex are set to NaN. 0
    is to the left of the first bin (which is closed on the right), and 1.5
    falls between two bins.

    >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
    >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
    [NaN, (0, 1], NaN, (2, 3], (4, 5]]
    Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]
    """
    # NOTE: this binning code is changed a bit from histogram for var(x) == 0

    # Keep the caller's object so a Series index/name can be restored later.
    original = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)

    if not np.iterable(bins):
        # ``bins`` is a bin count: derive equal-width edges from the data.
        if is_scalar(bins) and bins < 1:
            raise ValueError("`bins` should be a positive integer.")

        try:  # for array-like
            sz = x.size
        except AttributeError:
            x = np.asarray(x)
            sz = x.size

        if sz == 0:
            raise ValueError("Cannot cut empty array")

        # ``+ 0.0`` coerces the extrema to floating point before the
        # fractional adjustments below.
        rng = (nanops.nanmin(x), nanops.nanmax(x))
        mn, mx = (edge + 0.0 for edge in rng)

        if np.isinf(mn) or np.isinf(mx):
            # GH 24314
            raise ValueError(
                "cannot specify integer `bins` when input data contains infinity"
            )

        if mn == mx:  # adjust end points before binning
            # Zero-width range: widen both ends so linspace yields distinct edges.
            mn -= 0.001 * abs(mn) if mn != 0 else 0.001
            mx += 0.001 * abs(mx) if mx != 0 else 0.001
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
        else:  # adjust end points after binning
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
            adj = (mx - mn) * 0.001  # 0.1% of the range
            # Only the open end needs nudging so the extreme value is covered.
            if right:
                bins[0] -= adj
            else:
                bins[-1] += adj

    elif isinstance(bins, IntervalIndex):
        if bins.is_overlapping:
            raise ValueError("Overlapping IntervalIndex is not accepted.")

    else:
        # Explicit sequence of edges.
        if is_datetime64tz_dtype(bins):
            bins = np.asarray(bins, dtype=_NS_DTYPE)
        else:
            bins = np.asarray(bins)
        bins = _convert_bin_to_numeric_type(bins, dtype)

        # GH 26045: cast to float64 to avoid an overflow
        if (np.diff(bins.astype("float64")) < 0).any():
            raise ValueError("bins must increase monotonically.")

    fac, bins = _bins_to_cuts(
        x,
        bins,
        right=right,
        labels=labels,
        precision=precision,
        include_lowest=include_lowest,
        dtype=dtype,
        duplicates=duplicates,
    )

    return _postprocess_for_cut(fac, bins, retbins, dtype, original)

269 

270 

def qcut(
    x,
    q,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    duplicates: str = "raise",
):
    """
    Quantile-based discretization function.

    Split a variable into buckets whose edges are sample quantiles of the
    data. For example 1000 values for 10 quantiles would produce a
    Categorical object indicating quantile membership for each data point.

    Parameters
    ----------
    x : 1d ndarray or Series
        The data to discretize.
    q : int or list-like of float
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
    labels : array or False, default None
        Used as labels for the resulting bins. Must be of the same length as
        the resulting bins. If False, return only integer indicators of the
        bins. If True, raises an error.
    retbins : bool, optional
        Whether to also return the computed bin edges. Can be useful if `q`
        is given as a scalar.
    precision : int, optional
        The precision at which to store and display the bins labels.
    duplicates : {default 'raise', 'drop'}, optional
        If bin edges are not unique, raise ValueError or drop non-uniques.

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        The return type (Categorical or Series) depends on the input: a Series
        of type category if input is a Series else Categorical. Bins are
        represented as categories when categorical data is returned.
    bins : ndarray of floats
        Returned only if `retbins` is True.

    Notes
    -----
    Out of bounds values will be NA in the resulting Categorical object

    Examples
    --------
    >>> pd.qcut(range(5), 4)
    ... # doctest: +ELLIPSIS
    [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
    Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ...

    >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
    ... # doctest: +SKIP
    [good, good, medium, bad, bad]
    Categories (3, object): [good < medium < bad]

    >>> pd.qcut(range(5), 4, labels=False)
    array([0, 0, 1, 2, 3])
    """
    source = x
    values, dtype = _coerce_to_type(_preprocess_for_cut(x))

    # An integer ``q`` means "this many equally spaced quantiles on [0, 1]";
    # anything else is taken as the quantile levels themselves.
    levels = np.linspace(0, 1, q + 1) if is_integer(q) else q
    edges = algos.quantile(values, levels)

    # The lowest edge is always included so the minimum lands in the first bin.
    fac, edges = _bins_to_cuts(
        values,
        edges,
        labels=labels,
        precision=precision,
        include_lowest=True,
        dtype=dtype,
        duplicates=duplicates,
    )

    return _postprocess_for_cut(fac, edges, retbins, dtype, source)

352 

353 

354def _bins_to_cuts( 

355 x, 

356 bins, 

357 right: bool = True, 

358 labels=None, 

359 precision: int = 3, 

360 include_lowest: bool = False, 

361 dtype=None, 

362 duplicates: str = "raise", 

363): 

364 

365 if duplicates not in ["raise", "drop"]: 

366 raise ValueError( 

367 "invalid value for 'duplicates' parameter, " 

368 "valid options are: raise, drop" 

369 ) 

370 

371 if isinstance(bins, IntervalIndex): 

372 # we have a fast-path here 

373 ids = bins.get_indexer(x) 

374 result = Categorical.from_codes(ids, categories=bins, ordered=True) 

375 return result, bins 

376 

377 unique_bins = algos.unique(bins) 

378 if len(unique_bins) < len(bins) and len(bins) != 2: 

379 if duplicates == "raise": 

380 raise ValueError( 

381 f"Bin edges must be unique: {repr(bins)}.\n" 

382 f"You can drop duplicate edges by setting the 'duplicates' kwarg" 

383 ) 

384 else: 

385 bins = unique_bins 

386 

387 side = "left" if right else "right" 

388 ids = ensure_int64(bins.searchsorted(x, side=side)) 

389 

390 if include_lowest: 

391 ids[x == bins[0]] = 1 

392 

393 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) 

394 has_nas = na_mask.any() 

395 

396 if labels is not False: 

397 if not (labels is None or is_list_like(labels)): 

398 raise ValueError( 

399 "Bin labels must either be False, None or passed in as a " 

400 "list-like argument" 

401 ) 

402 

403 elif labels is None: 

404 labels = _format_labels( 

405 bins, precision, right=right, include_lowest=include_lowest, dtype=dtype 

406 ) 

407 

408 else: 

409 if len(labels) != len(bins) - 1: 

410 raise ValueError( 

411 "Bin labels must be one fewer than the number of bin edges" 

412 ) 

413 

414 if not is_categorical_dtype(labels): 

415 labels = Categorical(labels, categories=labels, ordered=True) 

416 

417 np.putmask(ids, na_mask, 0) 

418 result = algos.take_nd(labels, ids - 1) 

419 

420 else: 

421 result = ids - 1 

422 if has_nas: 

423 result = result.astype(np.float64) 

424 np.putmask(result, na_mask, np.nan) 

425 

426 return result, bins 

427 

428 

429def _coerce_to_type(x): 

430 """ 

431 if the passed data is of datetime/timedelta, bool or nullable int type, 

432 this method converts it to numeric so that cut or qcut method can 

433 handle it 

434 """ 

435 dtype = None 

436 

437 if is_datetime64tz_dtype(x): 

438 dtype = x.dtype 

439 elif is_datetime64_dtype(x): 

440 x = to_datetime(x) 

441 dtype = np.dtype("datetime64[ns]") 

442 elif is_timedelta64_dtype(x): 

443 x = to_timedelta(x) 

444 dtype = np.dtype("timedelta64[ns]") 

445 elif is_bool_dtype(x): 

446 # GH 20303 

447 x = x.astype(np.int64) 

448 # To support cut and qcut for IntegerArray we convert to float dtype. 

449 # Will properly support in the future. 

450 # https://github.com/pandas-dev/pandas/pull/31290 

451 # https://github.com/pandas-dev/pandas/issues/31389 

452 elif is_extension_array_dtype(x) and is_integer_dtype(x): 

453 x = x.to_numpy(dtype=np.float64, na_value=np.nan) 

454 

455 if dtype is not None: 

456 # GH 19768: force NaT to NaN during integer conversion 

457 x = np.where(x.notna(), x.view(np.int64), np.nan) 

458 

459 return x, dtype 

460 

461 

462def _convert_bin_to_numeric_type(bins, dtype): 

463 """ 

464 if the passed bin is of datetime/timedelta type, 

465 this method converts it to integer 

466 

467 Parameters 

468 ---------- 

469 bins : list-like of bins 

470 dtype : dtype of data 

471 

472 Raises 

473 ------ 

474 ValueError if bins are not of a compat dtype to dtype 

475 """ 

476 bins_dtype = infer_dtype(bins, skipna=False) 

477 if is_timedelta64_dtype(dtype): 

478 if bins_dtype in ["timedelta", "timedelta64"]: 

479 bins = to_timedelta(bins).view(np.int64) 

480 else: 

481 raise ValueError("bins must be of timedelta64 dtype") 

482 elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): 

483 if bins_dtype in ["datetime", "datetime64"]: 

484 bins = to_datetime(bins).view(np.int64) 

485 else: 

486 raise ValueError("bins must be of datetime64 dtype") 

487 

488 return bins 

489 

490 

491def _convert_bin_to_datelike_type(bins, dtype): 

492 """ 

493 Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is 

494 datelike 

495 

496 Parameters 

497 ---------- 

498 bins : list-like of bins 

499 dtype : dtype of data 

500 

501 Returns 

502 ------- 

503 bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is 

504 datelike 

505 """ 

506 if is_datetime64tz_dtype(dtype): 

507 bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz) 

508 elif is_datetime_or_timedelta_dtype(dtype): 

509 bins = Index(bins.astype(np.int64), dtype=dtype) 

510 return bins 

511 

512 

513def _format_labels( 

514 bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None 

515): 

516 """ based on the dtype, return our labels """ 

517 

518 closed = "right" if right else "left" 

519 

520 if is_datetime64tz_dtype(dtype): 

521 formatter = lambda x: Timestamp(x, tz=dtype.tz) 

522 adjust = lambda x: x - Timedelta("1ns") 

523 elif is_datetime64_dtype(dtype): 

524 formatter = Timestamp 

525 adjust = lambda x: x - Timedelta("1ns") 

526 elif is_timedelta64_dtype(dtype): 

527 formatter = Timedelta 

528 adjust = lambda x: x - Timedelta("1ns") 

529 else: 

530 precision = _infer_precision(precision, bins) 

531 formatter = lambda x: _round_frac(x, precision) 

532 adjust = lambda x: x - 10 ** (-precision) 

533 

534 breaks = [formatter(b) for b in bins] 

535 if right and include_lowest: 

536 # adjust lhs of first interval by precision to account for being right closed 

537 breaks[0] = adjust(breaks[0]) 

538 

539 return IntervalIndex.from_breaks(breaks, closed=closed) 

540 

541 

542def _preprocess_for_cut(x): 

543 """ 

544 handles preprocessing for cut where we convert passed 

545 input to array, strip the index information and store it 

546 separately 

547 """ 

548 

549 # Check that the passed array is a Pandas or Numpy object 

550 # We don't want to strip away a Pandas data-type here (e.g. datetimetz) 

551 ndim = getattr(x, "ndim", None) 

552 if ndim is None: 

553 x = np.asarray(x) 

554 if x.ndim != 1: 

555 raise ValueError("Input array must be 1 dimensional") 

556 

557 return x 

558 

559 

560def _postprocess_for_cut(fac, bins, retbins: bool, dtype, original): 

561 """ 

562 handles post processing for the cut method where 

563 we combine the index information if the originally passed 

564 datatype was a series 

565 """ 

566 if isinstance(original, ABCSeries): 

567 fac = original._constructor(fac, index=original.index, name=original.name) 

568 

569 if not retbins: 

570 return fac 

571 

572 bins = _convert_bin_to_datelike_type(bins, dtype) 

573 

574 return fac, bins 

575 

576 

577def _round_frac(x, precision: int): 

578 """ 

579 Round the fractional part of the given number 

580 """ 

581 if not np.isfinite(x) or x == 0: 

582 return x 

583 else: 

584 frac, whole = np.modf(x) 

585 if whole == 0: 

586 digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision 

587 else: 

588 digits = precision 

589 return np.around(x, digits) 

590 

591 

def _infer_precision(base_precision: int, bins) -> int:
    """
    Infer an appropriate precision for _round_frac.

    Returns the first precision, starting at `base_precision` and staying
    below 20, at which every rounded bin edge is still distinct. Falls back
    to `base_precision` when no such precision exists.
    """
    workable = (
        candidate
        for candidate in range(base_precision, 20)
        if algos.unique([_round_frac(edge, candidate) for edge in bins]).size
        == bins.size
    )
    return next(workable, base_precision)  # default