Skip to content

PERF: Improve performance of CustomBusinessDay #8253

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ Performance
- Performance improvements in ``StataWriter`` when writing large files (:issue:`8079`)
- Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
- Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)

- Performance improvement in ``CustomBusinessDay``, ``CustomBusinessMonth`` (:issue:`8236`)



Expand Down
157 changes: 96 additions & 61 deletions pandas/tseries/offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,66 @@ def _is_normalized(dt):
return False
return True

def _to_dt64(dt, dtype='datetime64'):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is already a _to_dt64; how is this one different?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This _to_dt64 used to be a method of CustomBusinessDay. I am not sure how it is different. But I can profile the two and try to see if we can get rid of this one.

# Currently
# > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]')
# numpy.datetime64('2013-05-01T02:00:00.000000+0200')
# Thus astype is needed to cast datetime to datetime64[D]

if getattr(dt, 'tzinfo', None) is not None:
i8 = tslib.pydt_to_i8(dt)
dt = tslib.tz_convert_single(i8, 'UTC', dt.tzinfo)
dt = Timestamp(dt)
dt = np.datetime64(dt)
if dt.dtype.name != dtype:
dt = dt.astype(dtype)
return dt

def from_various_func(cls,**kwds):
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this function needed? this completely changes the API.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Increments >1 are slow because a new instance of CustomBusinessDay is created whenever an operator is applied,
e.g.

    date + 2 * cbday

Previously, the constructor had to parse the holidays, which is very costly. This new function does the heavy lifting once, when you initialize CustomBusinessDay. After that, all computations are fast because the constructor is very short.

Leaving the API unchanged would mean changing the logic behind DateOffset.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can simply pass in an already-created holidays instance. You can't change the API like this: it is not intuitive at all and introduces a weird creation scheme. You might also be able to cache the holidays (in the class), which might be the best option.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I agree that the proposed API is not intuitive. I can check whether we can cache the np.busdaycalendar in kwds instead.

Converts various holiday inputs to format readable for np.busdaycalendar
and creates an instance of CustomBusinessDay, CustomBusinessMonth...

Parameters
----------
weekmask : str, Default 'Mon Tue Wed Thu Fri'
weekmask of valid business days, passed to ``numpy.busdaycalendar``
holidays : list
list/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``
calendar : HolidayCalendar instance
instance of AbstractHolidayCalendar that provide the list of holidays
"""
bdaycal_kwds = {}
if 'calendar' in kwds or 'holidays' in kwds: # generate holiday list
if 'calendar' in kwds:
holidays = kwds.pop('calendar').holidays()
else:
holidays = kwds.pop('holidays')
holidays = [_to_dt64(dt, dtype='datetime64[D]') for dt in
holidays]
holidays = tuple(sorted(holidays))
bdaycal_kwds['holidays'] = holidays
if 'weekmask' in kwds:
bdaycal_kwds['weekmask'] = kwds.pop('weekmask') # simply pass on to numpy

try:
busdaycalendar = np.busdaycalendar(**bdaycal_kwds)
except:
# Check we have the required numpy version
from distutils.version import LooseVersion

if LooseVersion(np.__version__) < '1.7.0':
raise NotImplementedError("CustomBusinessDay requires numpy >= "
"1.7.0. Current version: " +
np.__version__)
else:
raise

return cls(busdaycalendar=busdaycalendar, **kwds)



#----------------------------------------------------------------------
# DateOffset

Expand Down Expand Up @@ -344,17 +404,23 @@ def rollforward(self, dt):
return dt

def onOffset(self, dt):
# print('in onOffset')
if self.normalize and not _is_normalized(dt):
print('here')
return False
# XXX, see #1395
if type(self) == DateOffset or isinstance(self, Tick):
# print('Dateoffset')
return True

# Default (slow) method for determining if some date is a member of the
# date range generated by this offset. Subclasses may have this
# re-implemented in a nicer way.
a = dt
b = ((dt + self) - self)
# print(self)
# print('a={}'.format(a))
# print('b={}'.format(b))
return a == b

# way to get around weirdness with rule_code
Expand Down Expand Up @@ -542,13 +608,9 @@ class CustomBusinessDay(BusinessDay):
offset : timedelta, default timedelta(0)
normalize : bool, default False
Normalize start/end dates to midnight before generating date range
weekmask : str, Default 'Mon Tue Wed Thu Fri'
weekmask of valid business days, passed to ``numpy.busdaycalendar``
holidays : list
list/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``
calendar : HolidayCalendar instance
instance of AbstractHolidayCalendar that provide the list of holidays
busdaycalendar: np.busdaycalendar instance
Use `CustomBusinessDay.from_various` to instantiate from weekmask, list of holidays
or `HolidayCalendar` instance.
"""

_cacheable = False
Expand All @@ -559,36 +621,9 @@ def __init__(self, n=1, normalize=False, **kwds):
self.normalize = normalize
self.kwds = kwds
self.offset = kwds.get('offset', timedelta(0))
self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri')

if 'calendar' in kwds:
holidays = kwds['calendar'].holidays()
else:
holidays = kwds.get('holidays', [])
holidays = [self._to_dt64(dt, dtype='datetime64[D]') for dt in
holidays]
self.holidays = tuple(sorted(holidays))
self.kwds['holidays'] = self.holidays
self.busdaycalendar = kwds.get('busdaycalendar', np.busdaycalendar())

self._set_busdaycalendar()

def _set_busdaycalendar(self):
if self.holidays:
kwargs = {'weekmask':self.weekmask,'holidays':self.holidays}
else:
kwargs = {'weekmask':self.weekmask}
try:
self.busdaycalendar = np.busdaycalendar(**kwargs)
except:
# Check we have the required numpy version
from distutils.version import LooseVersion

if LooseVersion(np.__version__) < '1.7.0':
raise NotImplementedError("CustomBusinessDay requires numpy >= "
"1.7.0. Current version: " +
np.__version__)
else:
raise
from_various = classmethod(from_various_func)

def __getstate__(self):
"""Return a pickleable state"""
Expand Down Expand Up @@ -783,26 +818,26 @@ class CustomBusinessMonthEnd(BusinessMixin, MonthOffset):
offset : timedelta, default timedelta(0)
normalize : bool, default False
Normalize start/end dates to midnight before generating date range
weekmask : str, Default 'Mon Tue Wed Thu Fri'
weekmask of valid business days, passed to ``numpy.busdaycalendar``
holidays : list
list/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``
busdaycalendar: np.busdaycalendar instance
Use `CustomBusinessDay.from_various` to instantiate from weekmask, list of holidays
or `HolidayCalendar` instance.
"""

_cacheable = False
_prefix = 'CBM'
def __init__(self, n=1, normalize=False, **kwds):
self.n = int(n)
self.normalize = normalize
self.kwds = kwds
self.kwds = kwds.copy()
self.offset = kwds.get('offset', timedelta(0))
self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri')
self.cbday = CustomBusinessDay(n=self.n, **kwds)
self.m_offset = MonthEnd()
self.cbday = CustomBusinessDay(n=1, normalize=normalize, **kwds)
self.busdaycalendar = kwds.pop('busdaycalendar', np.busdaycalendar())
self.m_offset = MonthEnd(n=1, normalize=normalize, **kwds)

from_various = classmethod(from_various_func)

@apply_wraps
def apply(self,other):
def apply(self, other):
n = self.n
# First move to month offset
cur_mend = self.m_offset.rollforward(other)
Expand All @@ -817,11 +852,11 @@ def apply(self,other):
n -= 1
elif other > cur_cmend and n <= -1:
n += 1
new = cur_mend + n * MonthEnd()

new = cur_mend + n * self.m_offset
result = self.cbday.rollback(new)
return result

class CustomBusinessMonthBegin(BusinessMixin, MonthOffset):
"""
**EXPERIMENTAL** DateOffset of one custom business month
Expand All @@ -837,26 +872,26 @@ class CustomBusinessMonthBegin(BusinessMixin, MonthOffset):
offset : timedelta, default timedelta(0)
normalize : bool, default False
Normalize start/end dates to midnight before generating date range
weekmask : str, Default 'Mon Tue Wed Thu Fri'
weekmask of valid business days, passed to ``numpy.busdaycalendar``
holidays : list
list/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``
busdaycalendar: np.busdaycalendar instance
Use `CustomBusinessDay.from_various` to instantiate from weekmask, list of holidays
or `HolidayCalendar` instance.
"""

_cacheable = False
_prefix = 'CBMS'
def __init__(self, n=1, normalize=False, **kwds):
self.n = int(n)
self.normalize = normalize
self.kwds = kwds
self.kwds = kwds.copy()
self.offset = kwds.get('offset', timedelta(0))
self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri')
self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, **kwds)
self.m_offset = MonthBegin(normalize=normalize)
self.cbday = CustomBusinessDay(n=1, normalize=normalize, **kwds)
self.busdaycalendar = kwds.pop('busdaycalendar', np.busdaycalendar())
self.m_offset = MonthBegin(n=1, normalize=normalize, **kwds)

from_various = classmethod(from_various_func)

@apply_wraps
def apply(self,other):
def apply(self, other):
n = self.n
dt_in = other
# First move to month offset
Expand All @@ -872,8 +907,8 @@ def apply(self,other):
n += 1
elif dt_in < cur_cmbegin and n >= 1:
n -= 1
new = cur_mbegin + n * MonthBegin()

new = cur_mbegin + n * self.m_offset
result = self.cbday.rollforward(new)
return result

Expand Down
33 changes: 21 additions & 12 deletions pandas/tseries/tests/test_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ def _get_offset(self, klass, value=1, normalize=False):
klass = klass(n=value, weekday=5, normalize=normalize)
elif klass is DateOffset:
klass = klass(days=value, normalize=normalize)
# elif klass in [CDay, CBMonthEnd, CBMonthBegin]:
# klass = klass.from_various(n=value, normalize=normalize)
else:
try:
klass = klass(value, normalize=normalize)
Expand Down Expand Up @@ -808,7 +810,7 @@ def test_holidays(self):
# Define a TradingDay offset
holidays = ['2012-05-01', datetime(2013, 5, 1),
np.datetime64('2014-05-01')]
tday = CDay(holidays=holidays)
tday = CDay.from_various(holidays=holidays)
for year in range(2012, 2015):
dt = datetime(year, 4, 30)
xp = datetime(year, 5, 2)
Expand All @@ -819,9 +821,9 @@ def test_weekmask(self):
weekmask_saudi = 'Sat Sun Mon Tue Wed' # Thu-Fri Weekend
weekmask_uae = '1111001' # Fri-Sat Weekend
weekmask_egypt = [1,1,1,1,0,0,1] # Fri-Sat Weekend
bday_saudi = CDay(weekmask=weekmask_saudi)
bday_uae = CDay(weekmask=weekmask_uae)
bday_egypt = CDay(weekmask=weekmask_egypt)
bday_saudi = CDay.from_various(weekmask=weekmask_saudi)
bday_uae = CDay.from_various(weekmask=weekmask_uae)
bday_egypt = CDay.from_various(weekmask=weekmask_egypt)
dt = datetime(2013, 5, 1)
xp_saudi = datetime(2013, 5, 4)
xp_uae = datetime(2013, 5, 2)
Expand All @@ -838,15 +840,15 @@ def test_weekmask_and_holidays(self):
weekmask_egypt = 'Sun Mon Tue Wed Thu' # Fri-Sat Weekend
holidays = ['2012-05-01', datetime(2013, 5, 1),
np.datetime64('2014-05-01')]
bday_egypt = CDay(holidays=holidays, weekmask=weekmask_egypt)
bday_egypt = CDay.from_various(holidays=holidays, weekmask=weekmask_egypt)
dt = datetime(2013, 4, 30)
xp_egypt = datetime(2013, 5, 5)
self.assertEqual(xp_egypt, dt + 2 * bday_egypt)

def test_calendar(self):
calendar = USFederalHolidayCalendar()
dt = datetime(2014, 1, 17)
assertEq(CDay(calendar=calendar), dt, datetime(2014, 1, 21))
assertEq(CDay.from_various(calendar=calendar), dt, datetime(2014, 1, 21))

class CustomBusinessMonthBase(object):
_multiprocess_can_split_ = True
Expand Down Expand Up @@ -999,15 +1001,19 @@ def test_holidays(self):
# Define a TradingDay offset
holidays = ['2012-01-31', datetime(2012, 2, 28),
np.datetime64('2012-02-29')]
bm_offset = CBMonthEnd(holidays=holidays)
bm_offset = CBMonthEnd.from_various(holidays=holidays)
dt = datetime(2012,1,1)
self.assertEqual(dt + bm_offset,datetime(2012,1,30))
self.assertEqual(dt + 2*bm_offset,datetime(2012,2,27))

def test_datetimeindex(self):
from pandas.tseries.holiday import USFederalHolidayCalendar
self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthEnd(calendar=USFederalHolidayCalendar())).tolist()[0],
datetime(2012,1,31))
hcal = USFederalHolidayCalendar()
freq = CBMonthEnd.from_various(calendar=hcal)

self.assertEqual(DatetimeIndex(start='20120101',end='20130101',
freq=freq).tolist()[0],
datetime(2012,1,31))

class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base):
_object = CBMonthBegin
Expand Down Expand Up @@ -1114,14 +1120,17 @@ def test_holidays(self):
# Define a TradingDay offset
holidays = ['2012-02-01', datetime(2012, 2, 2),
np.datetime64('2012-03-01')]
bm_offset = CBMonthBegin(holidays=holidays)
bm_offset = CBMonthBegin.from_various(holidays=holidays)
dt = datetime(2012,1,1)
self.assertEqual(dt + bm_offset,datetime(2012,1,2))
self.assertEqual(dt + 2*bm_offset,datetime(2012,2,3))

def test_datetimeindex(self):
self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthBegin(calendar=USFederalHolidayCalendar())).tolist()[0],
datetime(2012,1,3))
hcal = USFederalHolidayCalendar()
cbmb = CBMonthBegin.from_various(calendar=hcal)
self.assertEqual(DatetimeIndex(start='20120101', end='20130101',
freq=cbmb).tolist()[0],
datetime(2012,1,3))


def assertOnOffset(offset, date, expected):
Expand Down
Loading