Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/reshape/merge.py : 10%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2SQL-style merge routines
3"""
5import copy
6import datetime
7from functools import partial
8import string
9from typing import TYPE_CHECKING, Optional, Tuple, Union
10import warnings
12import numpy as np
14from pandas._libs import Timedelta, hashtable as libhashtable, lib
15import pandas._libs.join as libjoin
16from pandas._typing import FrameOrSeries
17from pandas.errors import MergeError
18from pandas.util._decorators import Appender, Substitution
20from pandas.core.dtypes.common import (
21 ensure_float64,
22 ensure_int64,
23 ensure_object,
24 is_array_like,
25 is_bool,
26 is_bool_dtype,
27 is_categorical_dtype,
28 is_datetime64tz_dtype,
29 is_dtype_equal,
30 is_extension_array_dtype,
31 is_float_dtype,
32 is_integer,
33 is_integer_dtype,
34 is_list_like,
35 is_number,
36 is_numeric_dtype,
37 is_object_dtype,
38 needs_i8_conversion,
39)
40from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
41from pandas.core.dtypes.missing import isna, na_value_for_dtype
43from pandas import Categorical, Index, MultiIndex
44from pandas.core import groupby
45import pandas.core.algorithms as algos
46from pandas.core.arrays.categorical import _recode_for_categories
47import pandas.core.common as com
48from pandas.core.frame import _merge_doc
49from pandas.core.internals import _transform_index, concatenate_block_managers
50from pandas.core.sorting import is_int64_overflow_possible
52if TYPE_CHECKING:
53 from pandas import DataFrame, Series # noqa:F401
@Substitution("\nleft : DataFrame")
@Appender(_merge_doc, indents=0)
def merge(
    left,
    right,
    how: str = "inner",
    on=None,
    left_on=None,
    right_on=None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes=("_x", "_y"),
    copy: bool = True,
    indicator: bool = False,
    validate=None,
) -> "DataFrame":
    # Functional entry point: all of the real work lives in
    # _MergeOperation; this wrapper only forwards the arguments.
    merge_kwargs = dict(
        how=how,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        sort=sort,
        suffixes=suffixes,
        copy=copy,
        indicator=indicator,
        validate=validate,
    )
    return _MergeOperation(left, right, **merge_kwargs).get_result()
# Attach the shared merge docstring template (also used by DataFrame.merge).
# Skipped under ``python -O`` where docstrings may be stripped.
if __debug__:
    merge.__doc__ = _merge_doc % "\nleft : DataFrame"
def _groupby_and_merge(
    by, on, left, right: "DataFrame", _merge_pieces, check_duplicates: bool = True
):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    on: duplicates field
    left: left frame
    right: right frame
    _merge_pieces: function for merging
    check_duplicates: bool, default True
        should we check & clean duplicates

    Returns
    -------
    tuple of (merged DataFrame, left groupby object)
    """
    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)
    rby: Optional[groupby.DataFrameGroupBy] = None

    # if we can groupby the rhs
    # then we can get vastly better perf

    # we will check & remove duplicates if indicated
    if check_duplicates:
        if on is None:
            on = []
        elif not isinstance(on, (list, tuple)):
            on = [on]

        if right.duplicated(by + on).any():
            # keep="last" so the most recent duplicate wins
            _right = right.drop_duplicates(by + on, keep="last")
            # TODO: use overload to refine return type of drop_duplicates
            assert _right is not None  # needed for mypy
            right = _right
    # grouping the rhs may fail if the group columns are absent there;
    # in that case fall back to merging each lhs group against all of rhs
    try:
        rby = right.groupby(by, sort=False)
    except KeyError:
        pass

    for key, lhs in lby:

        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # key doesn't exist in left
                # build an all-NaN piece with the union of columns so the
                # final concat keeps a consistent column layout
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = _merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should _merge_pieces do this?
        for k in by:
            try:
                if k in merged:
                    merged[k] = key
            except KeyError:
                pass

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby
def merge_ordered(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_by=None,
    right_by=None,
    fill_method=None,
    suffixes=("_x", "_y"),
    how: str = "outer",
) -> "DataFrame":
    """
    Perform merge with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : Sequence, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

        .. versionchanged:: 0.25.0
    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will the be same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge
    merge_asof

    Examples
    --------
    >>> A
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> B
      Key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> merge_ordered(A, B, fill_method='ffill', left_by='group')
      group key  lvalue  rvalue
    0     a   a       1     NaN
    1     a   b       1     1.0
    2     a   c       2     2.0
    3     a   d       2     3.0
    4     a   e       3     3.0
    5     b   a       1     NaN
    6     b   b       1     1.0
    7     b   c       2     2.0
    8     b   d       2     3.0
    9     b   e       3     3.0
    """

    def _merger(x, y):
        # Run a single ordered merge between two aligned pieces.
        return _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        ).get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")

    if left_by is not None:
        # group-wise merge, grouping the left frame
        result, _ = _groupby_and_merge(
            left_by, on, left, right, lambda x, y: _merger(x, y), check_duplicates=False
        )
    elif right_by is not None:
        # group-wise merge, grouping the right frame; swap the operands
        # back inside the lambda so suffixes stay oriented correctly
        result, _ = _groupby_and_merge(
            right_by,
            on,
            right,
            left,
            lambda x, y: _merger(y, x),
            check_duplicates=False,
        )
    else:
        result = _merger(left, right)
    return result
def merge_asof(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes=("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> "DataFrame":
    """
    Perform an asof merge. This is similar to a left-join except that we
    match on nearest key rather than equal keys.

    Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    The default is "backward" and is compatible in versions below 0.20.0.
    The direction parameter was added in version 0.20.0 and introduces
    "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    merged : DataFrame

    See Also
    --------
    merge
    merge_ordered

    Examples
    --------
    >>> left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
    ...                       'right_val': [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on='a')
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on='a', direction='forward')
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on='a', direction='nearest')
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({'left_val': ['a', 'b', 'c']}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7]},
    ...                      index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world times-series example

    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes,
    ...               on='time',
    ...               by='ticker')
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(trades, quotes,
    ...               on='time',
    ...               by='ticker',
    ...               tolerance=pd.Timedelta('2ms'))
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(trades, quotes,
    ...               on='time',
    ...               by='ticker',
    ...               tolerance=pd.Timedelta('10ms'),
    ...               allow_exact_matches=False)
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75     NaN     NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100     NaN     NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100     NaN     NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
    """
    # All of the asof logic lives in _AsOfMerge; this function only
    # forwards parameters and forces how="asof".
    return _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    ).get_result()
544# TODO: transformations??
545# TODO: only copy DataFrames when modification necessary
546class _MergeOperation:
547 """
548 Perform a database (SQL) merge operation between two DataFrame or Series
549 objects using either columns as keys or their row indexes
550 """
552 _merge_type = "merge"
    def __init__(
        self,
        left: Union["Series", "DataFrame"],
        right: Union["Series", "DataFrame"],
        how: str = "inner",
        on=None,
        left_on=None,
        right_on=None,
        axis=1,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes=("_x", "_y"),
        copy: bool = True,
        indicator: bool = False,
        validate=None,
    ):
        """
        Validate and normalize the merge parameters and compute the join keys.

        Note: construction has side effects — key columns may be copied or
        dropped on ``self.left``/``self.right`` via ``_get_merge_keys`` and
        ``_maybe_coerce_merge_keys``.
        """
        # _validate_operand rejects anything that is not a DataFrame/Series
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        # keep the originals around: index-level restoration after the merge
        # needs the pre-coercion frames
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how
        self.axis = axis

        # normalize scalar key specs to lists (None stays None)
        self.on = com.maybe_make_list(on)
        self.left_on = com.maybe_make_list(left_on)
        self.right_on = com.maybe_make_list(right_on)

        self.copy = copy
        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        # indicator may be a bool (use "_merge") or a custom column name
        self.indicator_name: Optional[str]
        if isinstance(self.indicator, str):
            self.indicator_name = self.indicator
        elif isinstance(self.indicator, bool):
            self.indicator_name = "_merge" if self.indicator else None
        else:
            raise ValueError(
                "indicator option can only accept boolean or string arguments"
            )

        if not is_bool(left_index):
            raise ValueError(
                "left_index parameter must be of type bool, not "
                "{left_index}".format(left_index=type(left_index))
            )
        if not is_bool(right_index):
            raise ValueError(
                "right_index parameter must be of type bool, not "
                "{right_index}".format(right_index=type(right_index))
            )

        # warn user when merging between different levels
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "merging between different levels can give an unintended "
                "result ({left} levels on the left, {right} on the right)"
            ).format(left=_left.columns.nlevels, right=_right.columns.nlevels)
            warnings.warn(msg, UserWarning)

        self._validate_specification()

        # note this function has side effects
        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
        ) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompat dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate(validate)
    def get_result(self):
        """
        Execute the merge and return the resulting DataFrame.

        Computes the join indexers, concatenates the two frames' internal
        block managers along the row axis, then re-attaches join keys,
        restores index levels, and (optionally) adds the indicator column.
        """
        if self.indicator:
            # adds temporary _left_indicator/_right_indicator columns
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        ldata, rdata = self.left._data, self.right._data
        lsuf, rsuf = self.suffixes

        # disambiguate overlapping column names with the given suffixes
        llabels, rlabels = _items_overlap_with_suffix(
            ldata.items, lsuf, rdata.items, rsuf
        )

        # axis 1 of the block manager is the row axis; None indexer means
        # "take rows as-is"
        lindexers = {1: left_indexer} if left_indexer is not None else {}
        rindexers = {1: right_indexer} if right_indexer is not None else {}

        result_data = concatenate_block_managers(
            [(ldata, lindexers), (rdata, rindexers)],
            axes=[llabels.append(rlabels), join_index],
            concat_axis=0,
            copy=self.copy,
        )

        # preserve subclass type (e.g. a DataFrame subclass) via _constructor
        typ = self.left._constructor
        result = typ(result_data).__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        return result
674 def _indicator_pre_merge(
675 self, left: "DataFrame", right: "DataFrame"
676 ) -> Tuple["DataFrame", "DataFrame"]:
678 columns = left.columns.union(right.columns)
680 for i in ["_left_indicator", "_right_indicator"]:
681 if i in columns:
682 raise ValueError(
683 "Cannot use `indicator=True` option when "
684 "data contains a column named {name}".format(name=i)
685 )
686 if self.indicator_name in columns:
687 raise ValueError(
688 "Cannot use name of an existing column for indicator column"
689 )
691 left = left.copy()
692 right = right.copy()
694 left["_left_indicator"] = 1
695 left["_left_indicator"] = left["_left_indicator"].astype("int8")
697 right["_right_indicator"] = 2
698 right["_right_indicator"] = right["_right_indicator"].astype("int8")
700 return left, right
702 def _indicator_post_merge(self, result):
704 result["_left_indicator"] = result["_left_indicator"].fillna(0)
705 result["_right_indicator"] = result["_right_indicator"].fillna(0)
707 result[self.indicator_name] = Categorical(
708 (result["_left_indicator"] + result["_right_indicator"]),
709 categories=[1, 2, 3],
710 )
711 result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(
712 ["left_only", "right_only", "both"]
713 )
715 result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
716 return result
718 def _maybe_restore_index_levels(self, result):
719 """
720 Restore index levels specified as `on` parameters
722 Here we check for cases where `self.left_on` and `self.right_on` pairs
723 each reference an index level in their respective DataFrames. The
724 joined columns corresponding to these pairs are then restored to the
725 index of `result`.
727 **Note:** This method has side effects. It modifies `result` in-place
729 Parameters
730 ----------
731 result: DataFrame
732 merge result
734 Returns
735 -------
736 None
737 """
738 names_to_restore = []
739 for name, left_key, right_key in zip(
740 self.join_names, self.left_on, self.right_on
741 ):
742 if (
743 self.orig_left._is_level_reference(left_key)
744 and self.orig_right._is_level_reference(right_key)
745 and name not in result.index.names
746 ):
748 names_to_restore.append(name)
750 if names_to_restore:
751 result.set_index(names_to_restore, inplace=True)
    def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
        """
        Re-attach join-key columns to ``result`` in place, filling values
        from the opposite side where one side's indexer has missing (-1)
        entries (e.g. outer joins).
        """
        # lazily computed: whether each indexer contains any -1 entries
        left_has_missing = None
        right_has_missing = None

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            # values to (re)take for each side; None means "leave as-is"
            take_left, take_right = None, None

            if name in result:

                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:

                        if left_has_missing is None:
                            left_has_missing = (left_indexer == -1).any()

                        if left_has_missing:
                            # fill gaps in the left key from the right keys
                            take_right = self.right_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.left[name].dtype
                            ):
                                take_left = self.left[name]._values

                    elif name in self.right:

                        if right_has_missing is None:
                            right_has_missing = (right_indexer == -1).any()

                        if right_has_missing:
                            # fill gaps in the right key from the left keys
                            take_left = self.left_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.right[name].dtype
                            ):
                                take_right = self.right[name]._values

            elif left_indexer is not None and is_array_like(self.left_join_keys[i]):
                # key was passed as an array, so it is not a result column yet
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:

                if take_left is None:
                    lvals = result[name]._values
                else:
                    # fill value appropriate for the dtype (NaN/NaT/etc.)
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_1d(take_left, left_indexer, fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    rfill = na_value_for_dtype(take_right.dtype)
                    rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values
                mask = left_indexer == -1
                if mask.all():
                    key_col = rvals
                else:
                    key_col = Index(lvals).where(~mask, rvals)

                if result._is_label_reference(name):
                    result[name] = key_col
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        # rebuild the MultiIndex, substituting the new key
                        # values for the matching level
                        idx_list = [
                            result.index.get_level_values(level_name)
                            if level_name != name
                            else key_col
                            for level_name in result.index.names
                        ]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    # unnamed array key: insert with a generated column name
                    result.insert(i, name or "key_{i}".format(i=i), key_col)
838 def _get_join_indexers(self):
839 """ return the join indexers """
840 return _get_join_indexers(
841 self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
842 )
    def _get_join_info(self):
        """
        Compute the joined index plus left/right row indexers.

        Returns
        -------
        tuple of (join_index, left_indexer, right_indexer); an indexer of
        None means "take that side's rows unchanged".
        """
        left_ax = self.left._data.axes[self.axis]
        right_ax = self.right._data.axes[self.axis]

        if self.left_index and self.right_index and self.how != "asof":
            # index-on-index join: delegate entirely to Index.join
            join_index, left_indexer, right_indexer = left_ax.join(
                right_ax, how=self.how, return_indexers=True, sort=self.sort
            )
        elif self.right_index and self.how == "left":
            join_index, left_indexer, right_indexer = _left_join_on_index(
                left_ax, right_ax, self.left_join_keys, sort=self.sort
            )

        elif self.left_index and self.how == "right":
            # mirror of the case above with the operands swapped
            join_index, right_indexer, left_indexer = _left_join_on_index(
                right_ax, left_ax, self.right_join_keys, sort=self.sort
            )
        else:
            (left_indexer, right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        right_indexer,
                        how="right",
                    )
                else:
                    # empty left frame: the result index comes purely from
                    # the right, and every left row is missing
                    join_index = self.right.index.take(right_indexer)
                    left_indexer = np.array([-1] * len(join_index))
            elif self.left_index:
                if len(self.right) > 0:
                    join_index = self._create_join_index(
                        self.right.index,
                        self.left.index,
                        right_indexer,
                        left_indexer,
                        how="left",
                    )
                else:
                    join_index = self.left.index.take(left_indexer)
                    right_indexer = np.array([-1] * len(join_index))
            else:
                # neither side joins on its index: use a fresh RangeIndex-like
                join_index = Index(np.arange(len(left_indexer)))

        if len(join_index) == 0:
            # empty result gets an object-dtype index for consistency
            join_index = join_index.astype(object)
        return join_index, left_indexer, right_indexer
895 def _create_join_index(
896 self,
897 index: Index,
898 other_index: Index,
899 indexer,
900 other_indexer,
901 how: str = "left",
902 ):
903 """
904 Create a join index by rearranging one index to match another
906 Parameters
907 ----------
908 index: Index being rearranged
909 other_index: Index used to supply values not found in index
910 indexer: how to rearrange index
911 how: replacement is only necessary if indexer based on other_index
913 Returns
914 -------
915 join_index
916 """
917 if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
918 # if final index requires values in other_index but not target
919 # index, indexer may hold missing (-1) values, causing Index.take
920 # to take the final value in target index. So, we set the last
921 # element to be the desired fill value. We do not use allow_fill
922 # and fill_value because it throws a ValueError on integer indices
923 mask = indexer == -1
924 if np.any(mask):
925 fill_value = na_value_for_dtype(index.dtype, compat=False)
926 index = index.append(Index([fill_value]))
927 return index.take(indexer)
    def _get_merge_keys(self):
        """
        Note: has side effects (copy/delete key columns)

        Resolves ``left_on``/``right_on`` (or the frames' indexes) into
        concrete key arrays.  Keys may be column labels, index-level names,
        or already-materialized arrays of matching length.

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys
        """
        left_keys = []
        right_keys = []
        join_names = []
        right_drop = []
        left_drop = []

        left, right = self.left, self.right

        # an "lkey"/"rkey" is a raw array already aligned to the frame length
        is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
        is_rkey = lambda x: is_array_like(x) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_specification() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    left_keys.append(lk)
                    if is_rkey(rk):
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                        if lk is not None and lk == rk:
                            # avoid key upcast in corner case (length-0)
                            # same label on both sides -> drop one copy so it
                            # does not appear twice in the result
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        right_keys.append(rk)
                    if lk is not None:
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            # only left keys given: the right side joins on its index
            for k in self.left_on:
                if is_lkey(k):
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                # materialize each MultiIndex level as a key array
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            # mirror case: only right keys given, left joins on its index
            for k in self.right_on:
                if is_rkey(k):
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        return left_keys, right_keys, join_names
    def _maybe_coerce_merge_keys(self):
        """
        Coerce key columns so both sides merge on compatible dtypes;
        raises ValueError for obviously incompatible pairs.
        """
        # we have valid mergees but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            # one side empty, the other not: nothing sensible to coerce
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk_is_cat = is_categorical_dtype(lk)
            rk_is_cat = is_categorical_dtype(rk)
            lk_is_object = is_object_dtype(lk)
            rk_is_object = is_object_dtype(rk)

            # if either left or right is a categorical
            # then the must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                if lk.is_dtype_equal(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                # mismatched categorical vs non-categorical: fall through to
                # the coercion at the bottom
                pass

            elif is_dtype_equal(lk.dtype, rk.dtype):
                continue

            msg = (
                "You are trying to merge on {lk_dtype} and "
                "{rk_dtype} columns. If you wish to proceed "
                "you should use pd.concat".format(lk_dtype=lk.dtype, rk_dtype=rk.dtype)
            )

            # if we are numeric, then allow differing
            # kinds to proceed, eg. int64 and int8, int and float
            # further if we are object, but we infer to
            # the same, then proceed
            if is_numeric_dtype(lk) and is_numeric_dtype(rk):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                # check whether ints and floats
                elif is_integer_dtype(rk) and is_float_dtype(lk):
                    # warn if float keys lose information when viewed as ints
                    if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int "
                            "representation",
                            UserWarning,
                        )
                    continue

                elif is_float_dtype(rk) and is_integer_dtype(lk):
                    if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int "
                            "representation",
                            UserWarning,
                        )
                    continue

                # let's infer and see if we are ok
                elif lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object and is_bool_dtype(rk)) or (
                is_bool_dtype(lk) and rk_is_object
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object and is_numeric_dtype(rk)) or (
                is_numeric_dtype(lk) and rk_is_object
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk) and not needs_i8_conversion(rk):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk) and needs_i8_conversion(rk):
                raise ValueError(msg)
            elif is_datetime64tz_dtype(lk) and not is_datetime64tz_dtype(rk):
                raise ValueError(msg)
            elif not is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
                raise ValueError(msg)

            elif lk_is_object and rk_is_object:
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = lk.categories.dtype if lk_is_cat else object
                self.left = self.left.assign(**{name: self.left[name].astype(typ)})
            if name in self.right.columns:
                typ = rk.categories.dtype if rk_is_cat else object
                self.right = self.right.assign(**{name: self.right[name].astype(typ)})
    def _validate_specification(self):
        """
        Reconcile the ``on`` / ``left_on`` / ``right_on`` / ``left_index`` /
        ``right_index`` arguments into a consistent pair of key lists on
        ``self.left_on`` / ``self.right_on``.

        Raises
        ------
        MergeError
            If the combination of key arguments is ambiguous, incomplete, or
            (when falling back to common columns) no usable columns exist.
        ValueError
            If the key-list lengths do not line up with each other or with
            the relevant index's number of levels.
        """
        # Hm, any way to make this logic less complicated??
        if self.on is None and self.left_on is None and self.right_on is None:

            if self.left_index and self.right_index:
                # pure index-on-index join: no column keys on either side
                self.left_on, self.right_on = (), ()
            elif self.left_index:
                if self.right_on is None:
                    raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                if self.left_on is None:
                    raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                common_cols = self.left.columns.intersection(self.right.columns)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        "Merge options: left_on={lon}, right_on={ron}, "
                        "left_index={lidx}, right_index={ridx}".format(
                            lon=self.left_on,
                            ron=self.right_on,
                            lidx=self.left_index,
                            ridx=self.right_index,
                        )
                    )
                if not common_cols.is_unique:
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                self.left_on = self.right_on = common_cols
        elif self.on is not None:
            if self.left_on is not None or self.right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            self.left_on = self.right_on = self.on
        elif self.left_on is not None:
            n = len(self.left_on)
            if self.right_index:
                if len(self.left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                # None entries act as placeholders — presumably resolved
                # against the right index levels downstream (verify in
                # _get_merge_keys)
                self.right_on = [None] * n
        elif self.right_on is not None:
            n = len(self.right_on)
            if self.left_index:
                if len(self.right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                # None entries act as placeholders — see note above on
                # the mirrored branch
                self.left_on = [None] * n
        if len(self.right_on) != len(self.left_on):
            raise ValueError("len(right_on) must equal len(left_on)")
1232 def _validate(self, validate: str):
1234 # Check uniqueness of each
1235 if self.left_index:
1236 left_unique = self.orig_left.index.is_unique
1237 else:
1238 left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique
1240 if self.right_index:
1241 right_unique = self.orig_right.index.is_unique
1242 else:
1243 right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique
1245 # Check data integrity
1246 if validate in ["one_to_one", "1:1"]:
1247 if not left_unique and not right_unique:
1248 raise MergeError(
1249 "Merge keys are not unique in either left "
1250 "or right dataset; not a one-to-one merge"
1251 )
1252 elif not left_unique:
1253 raise MergeError(
1254 "Merge keys are not unique in left dataset; "
1255 "not a one-to-one merge"
1256 )
1257 elif not right_unique:
1258 raise MergeError(
1259 "Merge keys are not unique in right dataset; "
1260 "not a one-to-one merge"
1261 )
1263 elif validate in ["one_to_many", "1:m"]:
1264 if not left_unique:
1265 raise MergeError(
1266 "Merge keys are not unique in left dataset; "
1267 "not a one-to-many merge"
1268 )
1270 elif validate in ["many_to_one", "m:1"]:
1271 if not right_unique:
1272 raise MergeError(
1273 "Merge keys are not unique in right dataset; "
1274 "not a many-to-one merge"
1275 )
1277 elif validate in ["many_to_many", "m:m"]:
1278 pass
1280 else:
1281 raise ValueError("Not a valid argument for validate")
def _get_join_indexers(
    left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs
):
    """
    Compute row indexers realizing a join between two lists of key arrays.

    Parameters
    ----------
    left_keys: ndarray, Index, Series
    right_keys: ndarray, Index, Series
    sort: bool, default False
    how: string {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    tuple of (left_indexer, right_indexer)
        indexers into the left_keys, right_keys
    """
    assert len(left_keys) == len(
        right_keys
    ), "left_key and right_keys must be the same length"

    # Factorize each key level jointly so both sides share one label space;
    # collect labels for each side plus the per-level cardinality.
    factorized = [
        _factorize_keys(lk, rk, sort=sort) for lk, rk in zip(left_keys, right_keys)
    ]
    llab = [f[0] for f in factorized]
    rlab = [f[1] for f in factorized]
    shape = [f[2] for f in factorized]

    # collapse the per-level labels into flat i8 keys
    lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

    # factorize keys to a dense i8 space;
    # `count` is the num. of unique keys: set(lkey) | set(rkey) == range(count)
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    # preserve left frame order if how == 'left' and sort == False
    kwargs = copy.copy(kwargs)
    if how == "left":
        kwargs["sort"] = sort

    return _join_functions[how](lkey, rkey, count, **kwargs)
1331def _restore_dropped_levels_multijoin(
1332 left: MultiIndex,
1333 right: MultiIndex,
1334 dropped_level_names,
1335 join_index,
1336 lindexer,
1337 rindexer,
1338):
1339 """
1340 *this is an internal non-public method*
1342 Returns the levels, labels and names of a multi-index to multi-index join.
1343 Depending on the type of join, this method restores the appropriate
1344 dropped levels of the joined multi-index.
1345 The method relies on lidx, rindexer which hold the index positions of
1346 left and right, where a join was feasible
1348 Parameters
1349 ----------
1350 left : MultiIndex
1351 left index
1352 right : MultiIndex
1353 right index
1354 dropped_level_names : str array
1355 list of non-common level names
1356 join_index : MultiIndex
1357 the index of the join between the
1358 common levels of left and right
1359 lindexer : intp array
1360 left indexer
1361 rindexer : intp array
1362 right indexer
1364 Returns
1365 -------
1366 levels : list of Index
1367 levels of combined multiindexes
1368 labels : intp array
1369 labels of combined multiindexes
1370 names : str array
1371 names of combined multiindexes
1373 """
1375 def _convert_to_mulitindex(index) -> MultiIndex:
1376 if isinstance(index, MultiIndex):
1377 return index
1378 else:
1379 return MultiIndex.from_arrays([index.values], names=[index.name])
1381 # For multi-multi joins with one overlapping level,
1382 # the returned index if of type Index
1383 # Assure that join_index is of type MultiIndex
1384 # so that dropped levels can be appended
1385 join_index = _convert_to_mulitindex(join_index)
1387 join_levels = join_index.levels
1388 join_codes = join_index.codes
1389 join_names = join_index.names
1391 # lindexer and rindexer hold the indexes where the join occurred
1392 # for left and right respectively. If left/right is None then
1393 # the join occurred on all indices of left/right
1394 if lindexer is None:
1395 lindexer = range(left.size)
1397 if rindexer is None:
1398 rindexer = range(right.size)
1400 # Iterate through the levels that must be restored
1401 for dropped_level_name in dropped_level_names:
1402 if dropped_level_name in left.names:
1403 idx = left
1404 indexer = lindexer
1405 else:
1406 idx = right
1407 indexer = rindexer
1409 # The index of the level name to be restored
1410 name_idx = idx.names.index(dropped_level_name)
1412 restore_levels = idx.levels[name_idx]
1413 # Inject -1 in the codes list where a join was not possible
1414 # IOW indexer[i]=-1
1415 codes = idx.codes[name_idx]
1416 restore_codes = algos.take_nd(codes, indexer, fill_value=-1)
1418 join_levels = join_levels + [restore_levels]
1419 join_codes = join_codes + [restore_codes]
1420 join_names = join_names + [dropped_level_name]
1422 return join_levels, join_codes, join_names
class _OrderedMerge(_MergeOperation):
    """
    Merge operation that always sorts during factorization and can
    forward-fill the row indexers when ``fill_method == 'ffill'``.
    """

    _merge_type = "ordered_merge"

    def __init__(
        self,
        left,
        right,
        on=None,
        left_on=None,
        right_on=None,
        left_index: bool = False,
        right_index: bool = False,
        axis=1,
        suffixes=("_x", "_y"),
        copy: bool = True,
        fill_method=None,
        how: str = "outer",
    ):
        # only consulted in get_result; 'ffill' triggers indexer filling
        self.fill_method = fill_method
        _MergeOperation.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            left_index=left_index,
            right_index=right_index,
            right_on=right_on,
            axis=axis,
            how=how,
            suffixes=suffixes,
            sort=True,  # factorize sorts
        )

    def get_result(self):
        """Assemble the merged frame from the two block managers."""
        join_index, left_indexer, right_indexer = self._get_join_info()

        # this is a bit kludgy: reach directly into the block managers
        left_mgr = self.left._data
        right_mgr = self.right._data
        left_suffix, right_suffix = self.suffixes

        left_labels, right_labels = _items_overlap_with_suffix(
            left_mgr.items, left_suffix, right_mgr.items, right_suffix
        )

        if self.fill_method == "ffill":
            # carry the last matched position forward across gaps
            left_take = libjoin.ffill_indexer(left_indexer)
            right_take = libjoin.ffill_indexer(right_indexer)
        else:
            left_take = left_indexer
            right_take = right_indexer

        lindexers = {} if left_take is None else {1: left_take}
        rindexers = {} if right_take is None else {1: right_take}

        result_data = concatenate_block_managers(
            [(left_mgr, lindexers), (right_mgr, rindexers)],
            axes=[left_labels.append(right_labels), join_index],
            concat_axis=0,
            copy=self.copy,
        )

        result = self.left._constructor(result_data).__finalize__(
            self, method=self._merge_type
        )

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        return result
1496def _asof_function(direction: str):
1497 name = "asof_join_{dir}".format(dir=direction)
1498 return getattr(libjoin, name, None)
1501def _asof_by_function(direction: str):
1502 name = "asof_join_{dir}_on_X_by_Y".format(dir=direction)
1503 return getattr(libjoin, name, None)
# Casters applied to 'by' values before hashtable-based joins; keyed by the
# string returned from _get_cython_type_upcast below.
_type_casters = {
    "int64_t": ensure_int64,
    "double": ensure_float64,
    "object": ensure_object,
}
1513def _get_cython_type_upcast(dtype):
1514 """ Upcast a dtype to 'int64_t', 'double', or 'object' """
1515 if is_integer_dtype(dtype):
1516 return "int64_t"
1517 elif is_float_dtype(dtype):
1518 return "double"
1519 else:
1520 return "object"
class _AsOfMerge(_OrderedMerge):
    # Backs merge_asof(): a left join matching on the nearest (not
    # necessarily equal) key, optionally constrained by exact-match 'by'
    # columns, a tolerance, and a search direction.
    _merge_type = "asof_merge"

    def __init__(
        self,
        left,
        right,
        on=None,
        left_on=None,
        right_on=None,
        left_index: bool = False,
        right_index: bool = False,
        by=None,
        left_by=None,
        right_by=None,
        axis=1,
        suffixes=("_x", "_y"),
        copy: bool = True,
        fill_method=None,
        how: str = "asof",
        tolerance=None,
        allow_exact_matches: bool = True,
        direction: str = "backward",
    ):
        # asof-specific options; validated in _validate_specification and
        # _get_merge_keys below
        self.by = by
        self.left_by = left_by
        self.right_by = right_by
        self.tolerance = tolerance
        self.allow_exact_matches = allow_exact_matches
        self.direction = direction

        _OrderedMerge.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            axis=axis,
            how=how,
            suffixes=suffixes,
            fill_method=fill_method,
        )

    def _validate_specification(self):
        """
        Extend the base validation with asof constraints: exactly one 'on'
        key per side, single-level indexes, consistent by/left_by/right_by,
        and a recognised direction.
        """
        super()._validate_specification()

        # we only allow on to be a single item for on
        if len(self.left_on) != 1 and not self.left_index:
            raise MergeError("can only asof on a key for left")

        if len(self.right_on) != 1 and not self.right_index:
            raise MergeError("can only asof on a key for right")

        if self.left_index and isinstance(self.left.index, MultiIndex):
            raise MergeError("left can only have one index")

        if self.right_index and isinstance(self.right.index, MultiIndex):
            raise MergeError("right can only have one index")

        # set 'by' columns
        if self.by is not None:
            if self.left_by is not None or self.right_by is not None:
                raise MergeError("Can only pass by OR left_by and right_by")
            self.left_by = self.right_by = self.by
        if self.left_by is None and self.right_by is not None:
            raise MergeError("missing left_by")
        if self.left_by is not None and self.right_by is None:
            raise MergeError("missing right_by")

        # add 'by' to our key-list so we can have it in the
        # output as a key
        if self.left_by is not None:
            if not is_list_like(self.left_by):
                self.left_by = [self.left_by]
            if not is_list_like(self.right_by):
                self.right_by = [self.right_by]

            if len(self.left_by) != len(self.right_by):
                raise MergeError("left_by and right_by must be same length")

            # 'by' keys go first; the asof key stays last (see _asof_key)
            self.left_on = self.left_by + list(self.left_on)
            self.right_on = self.right_by + list(self.right_on)

        # check 'direction' is valid
        if self.direction not in ["backward", "forward", "nearest"]:
            raise MergeError(
                "direction invalid: {direction}".format(direction=self.direction)
            )

    @property
    def _asof_key(self):
        """ This is our asof key, the 'on' """
        return self.left_on[-1]

    def _get_merge_keys(self):
        """
        Return the base-class merge keys after validating dtype
        compatibility, the tolerance type/sign, and allow_exact_matches.
        """

        # note this function has side effects
        (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys()

        # validate index types are the same
        for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
            if not is_dtype_equal(lk.dtype, rk.dtype):
                if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype):
                    # The generic error message is confusing for categoricals.
                    #
                    # In this function, the join keys include both the original
                    # ones of the merge_asof() call, and also the keys passed
                    # to its by= argument. Unordered but equal categories
                    # are not supported for the former, but will fail
                    # later with a ValueError, so we don't *need* to check
                    # for them here.
                    msg = (
                        "incompatible merge keys [{i}] {lkdtype} and "
                        "{rkdtype}, both sides category, but not equal ones".format(
                            i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype)
                        )
                    )
                else:
                    msg = (
                        "incompatible merge keys [{i}] {lkdtype} and "
                        "{rkdtype}, must be the same type".format(
                            i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype)
                        )
                    )
                raise MergeError(msg)

        # validate tolerance; datetime.timedelta or Timedelta if we have a DTI
        if self.tolerance is not None:

            if self.left_index:
                lt = self.left.index
            else:
                lt = left_join_keys[-1]

            msg = (
                "incompatible tolerance {tolerance}, must be compat "
                "with type {lkdtype}".format(
                    tolerance=type(self.tolerance), lkdtype=repr(lt.dtype)
                )
            )

            if needs_i8_conversion(lt):
                if not isinstance(self.tolerance, datetime.timedelta):
                    raise MergeError(msg)
                if self.tolerance < Timedelta(0):
                    raise MergeError("tolerance must be positive")

            elif is_integer_dtype(lt):
                if not is_integer(self.tolerance):
                    raise MergeError(msg)
                if self.tolerance < 0:
                    raise MergeError("tolerance must be positive")

            elif is_float_dtype(lt):
                if not is_number(self.tolerance):
                    raise MergeError(msg)
                if self.tolerance < 0:
                    raise MergeError("tolerance must be positive")

            else:
                raise MergeError("key must be integer, timestamp or float")

        # validate allow_exact_matches
        if not is_bool(self.allow_exact_matches):
            msg = "allow_exact_matches must be boolean, passed {passed}"
            raise MergeError(msg.format(passed=self.allow_exact_matches))

        return left_join_keys, right_join_keys, join_names

    def _get_join_indexers(self):
        """ return the join indexers """

        def flip(xs):
            """ unlike np.transpose, this returns an array of tuples """
            xs = [
                x if not is_extension_array_dtype(x) else x._ndarray_values for x in xs
            ]
            # build a structured dtype so each row becomes one composite key
            labels = list(string.ascii_lowercase[: len(xs)])
            dtypes = [x.dtype for x in xs]
            labeled_dtypes = list(zip(labels, dtypes))
            return np.array(list(zip(*xs)), labeled_dtypes)

        # values to compare
        left_values = (
            self.left.index.values if self.left_index else self.left_join_keys[-1]
        )
        right_values = (
            self.right.index.values if self.right_index else self.right_join_keys[-1]
        )
        tolerance = self.tolerance

        # we require sortedness and non-null values in the join keys
        msg_sorted = "{side} keys must be sorted"
        msg_missings = "Merge keys contain null values on {side} side"

        if not Index(left_values).is_monotonic:
            if isna(left_values).any():
                raise ValueError(msg_missings.format(side="left"))
            else:
                raise ValueError(msg_sorted.format(side="left"))

        if not Index(right_values).is_monotonic:
            if isna(right_values).any():
                raise ValueError(msg_missings.format(side="right"))
            else:
                raise ValueError(msg_sorted.format(side="right"))

        # initial type conversion as needed
        if needs_i8_conversion(left_values):
            left_values = left_values.view("i8")
            right_values = right_values.view("i8")
            if tolerance is not None:
                tolerance = Timedelta(tolerance)
                tolerance = tolerance.value

        # a "by" parameter requires special handling
        if self.left_by is not None:
            # remove 'on' parameter from values if one existed
            if self.left_index and self.right_index:
                left_by_values = self.left_join_keys
                right_by_values = self.right_join_keys
            else:
                left_by_values = self.left_join_keys[0:-1]
                right_by_values = self.right_join_keys[0:-1]

            # get tuple representation of values if more than one
            if len(left_by_values) == 1:
                left_by_values = left_by_values[0]
                right_by_values = right_by_values[0]
            else:
                left_by_values = flip(left_by_values)
                right_by_values = flip(right_by_values)

            # upcast 'by' parameter because HashTable is limited
            by_type = _get_cython_type_upcast(left_by_values.dtype)
            by_type_caster = _type_casters[by_type]
            left_by_values = by_type_caster(left_by_values)
            right_by_values = by_type_caster(right_by_values)

            # choose appropriate function by type
            func = _asof_by_function(self.direction)
            return func(
                left_values,
                right_values,
                left_by_values,
                right_by_values,
                self.allow_exact_matches,
                tolerance,
            )
        else:
            # choose appropriate function by type
            func = _asof_function(self.direction)
            return func(left_values, right_values, self.allow_exact_matches, tolerance)
def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool):
    """
    Left-outer join of ``join_keys`` against the levels of a MultiIndex,
    returning a (left_indexer, right_indexer) pair over the flattened keys.
    """

    # left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(index.levels[n], join_keys[n], sort=sort)
        for n in range(index.nlevels)
    )
    zipped = zip(*mapped)
    rcodes, lcodes, shape = [list(x) for x in zipped]
    if sort:
        # translate the index's per-level codes into the sorted label space
        rcodes = list(map(np.take, rcodes, index.codes))
    else:
        i8copy = lambda a: a.astype("i8", subok=False, copy=True)
        rcodes = list(map(i8copy, index.codes))

    # fix right labels if there were any nulls
    for i in range(len(join_keys)):
        mask = index.codes[i] == -1
        if mask.any():
            # check if there already was any nulls at this location
            # if there was, it is factorized to `shape[i] - 1`
            # (a[0] != a[0] is the NaN self-inequality test)
            a = join_keys[i][lcodes[i] == shape[i] - 1]
            if a.size == 0 or not a[0] != a[0]:
                # no existing NaN label: widen the level by one slot for it
                shape[i] += 1

            rcodes[i][mask] = shape[i] - 1

    # get flat i8 join keys
    lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort)

    # factorize keys to a dense i8 space
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    return libjoin.left_outer_join(lkey, rkey, count, sort=sort)
def _get_single_indexer(join_key, index, sort: bool = False):
    """Left-outer join of a single key array against *index*; returns the
    (left_indexer, right_indexer) pair."""
    lkey, rkey, count = _factorize_keys(join_key, index, sort=sort)

    return libjoin.left_outer_join(
        ensure_int64(lkey), ensure_int64(rkey), count, sort=sort
    )
def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False):
    """
    Left join of key arrays against ``right_ax``, returning
    (join_index, left_indexer, right_indexer); the left indexer is None
    when the left axis is preserved as-is.
    """
    if len(join_keys) > 1:
        multi_ok = (
            isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels
        )
        if not multi_ok:
            raise AssertionError(
                "If more than one join key is given then "
                "'right_ax' must be a MultiIndex and the "
                "number of join keys must be the number of "
                "levels in right_ax"
            )

        left_indexer, right_indexer = _get_multiindex_indexer(
            join_keys, right_ax, sort=sort
        )
    else:
        left_indexer, right_indexer = _get_single_indexer(
            join_keys[0], right_ax, sort=sort
        )

    if sort or len(left_ax) != len(left_indexer):
        # if asked to sort or there are 1-to-many matches
        return left_ax.take(left_indexer), left_indexer, right_indexer

    # left frame preserves order & length of its index
    return left_ax, None, right_indexer
1857def _right_outer_join(x, y, max_groups):
1858 right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups)
1859 return left_indexer, right_indexer
# Dispatch table mapping the ``how`` keyword to the join routine that
# implements it; "right" reuses the left join with the operands swapped.
_join_functions = {
    "inner": libjoin.inner_join,
    "left": libjoin.left_outer_join,
    "right": _right_outer_join,
    "outer": libjoin.full_outer_join,
}
def _factorize_keys(lk, rk, sort=True):
    """
    Encode the left/right key arrays into one shared integer label space.

    Returns
    -------
    llab : labels for ``lk``
    rlab : labels for ``rk``
    count : number of distinct labels; if either side contains NA, the NA
        group is assigned the extra label ``count - 1``
    """
    # Some pre-processing for non-ndarray lk / rk
    if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
        # tz-aware: factorize on the underlying data
        lk = getattr(lk, "_values", lk)._data
        rk = getattr(rk, "_values", rk)._data

    elif (
        is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk)
    ):
        if lk.categories.equals(rk.categories):
            # if we exactly match in categories, allow us to factorize on codes
            rk = rk.codes
        else:
            # Same categories in different orders -> recode
            rk = _recode_for_categories(rk.codes, rk.categories, lk.categories)

        lk = ensure_int64(lk.codes)
        rk = ensure_int64(rk)

    elif (
        is_extension_array_dtype(lk.dtype)
        and is_extension_array_dtype(rk.dtype)
        and lk.dtype == rk.dtype
    ):
        # matching extension dtypes: factorize on their native values
        lk, _ = lk._values_for_factorize()
        rk, _ = rk._values_for_factorize()

    # Pick the hashtable implementation matching the (possibly converted)
    # key dtypes; everything non-integer/datetimelike goes through object.
    if is_integer_dtype(lk) and is_integer_dtype(rk):
        # GH#23917 TODO: needs tests for case where lk is integer-dtype
        # and rk is datetime-dtype
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(com.values_from_object(lk))
        rk = ensure_int64(com.values_from_object(rk))
    elif issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and issubclass(
        rk.dtype.type, (np.timedelta64, np.datetime64)
    ):
        # GH#23917 TODO: Needs tests for non-matching dtypes
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(com.values_from_object(lk))
        rk = ensure_int64(com.values_from_object(rk))
    else:
        klass = libhashtable.Factorizer
        lk = ensure_object(lk)
        rk = ensure_object(rk)

    rizer = klass(max(len(lk), len(rk)))

    # factorizing both sides through one rizer gives a shared label space
    llab = rizer.factorize(lk)
    rlab = rizer.factorize(rk)

    count = rizer.get_count()

    if sort:
        uniques = rizer.uniques.to_array()
        llab, rlab = _sort_labels(uniques, llab, rlab)

    # NA group
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
    rany = rmask.any()

    if lany or rany:
        # move the -1 sentinel into its own trailing label
        if lany:
            np.putmask(llab, lmask, count)
        if rany:
            np.putmask(rlab, rmask, count)
        count += 1

    return llab, rlab, count
1942def _sort_labels(uniques: np.ndarray, left, right):
1943 if not isinstance(uniques, np.ndarray):
1944 # tuplesafe
1945 uniques = Index(uniques).values
1947 llength = len(left)
1948 labels = np.concatenate([left, right])
1950 _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
1951 new_labels = ensure_int64(new_labels)
1952 new_left, new_right = new_labels[:llength], new_labels[llength:]
1954 return new_left, new_right
def _get_join_keys(llab, rlab, shape, sort: bool):
    """
    Collapse per-level label arrays into single flat int64 join keys,
    recursing with re-factorized partial keys whenever combining all the
    remaining levels at once could overflow int64.
    """

    # how many levels can be done without overflow
    pred = lambda i: not is_int64_overflow_possible(shape[:i])
    nlev = next(filter(pred, range(len(shape), 0, -1)))

    # get keys for the first `nlev` levels
    stride = np.prod(shape[1:nlev], dtype="i8")
    lkey = stride * llab[0].astype("i8", subok=False, copy=False)
    rkey = stride * rlab[0].astype("i8", subok=False, copy=False)

    for i in range(1, nlev):
        with np.errstate(divide="ignore"):
            stride //= shape[i]
        lkey += llab[i] * stride
        rkey += rlab[i] * stride

    if nlev == len(shape):  # all done!
        return lkey, rkey

    # densify current keys to avoid overflow
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    # recurse with the densified partial key standing in as level 0
    llab = [lkey] + llab[nlev:]
    rlab = [rkey] + rlab[nlev:]
    shape = [count] + shape[nlev:]

    return _get_join_keys(llab, rlab, shape, sort)
1987def _should_fill(lname, rname) -> bool:
1988 if not isinstance(lname, str) or not isinstance(rname, str):
1989 return True
1990 return lname == rname
1993def _any(x) -> bool:
1994 return x is not None and com.any_not_none(*x)
1997def _validate_operand(obj: FrameOrSeries) -> "DataFrame":
1998 if isinstance(obj, ABCDataFrame):
1999 return obj
2000 elif isinstance(obj, ABCSeries):
2001 if obj.name is None:
2002 raise ValueError("Cannot merge a Series without a name")
2003 else:
2004 return obj.to_frame()
2005 else:
2006 raise TypeError(
2007 "Can only merge Series or DataFrame objects, "
2008 "a {obj} was passed".format(obj=type(obj))
2009 )
2012def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix):
2013 """
2014 If two indices overlap, add suffixes to overlapping entries.
2016 If corresponding suffix is empty, the entry is simply converted to string.
2018 """
2019 to_rename = left.intersection(right)
2020 if len(to_rename) == 0:
2021 return left, right
2023 if not lsuffix and not rsuffix:
2024 raise ValueError(
2025 "columns overlap but no suffix specified: "
2026 "{rename}".format(rename=to_rename)
2027 )
2029 def renamer(x, suffix):
2030 """
2031 Rename the left and right indices.
2033 If there is overlap, and suffix is not None, add
2034 suffix, otherwise, leave it as-is.
2036 Parameters
2037 ----------
2038 x : original column name
2039 suffix : str or None
2041 Returns
2042 -------
2043 x : renamed column name
2044 """
2045 if x in to_rename and suffix is not None:
2046 return "{x}{suffix}".format(x=x, suffix=suffix)
2047 return x
2049 lrenamer = partial(renamer, suffix=lsuffix)
2050 rrenamer = partial(renamer, suffix=rsuffix)
2052 return (_transform_index(left, lrenamer), _transform_index(right, rrenamer))