Skip to content

BUG: edge case when reading from postgresql with read_sql_query and datetime with tz and chunksize #11216

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 3, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 28 additions & 11 deletions pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pandas.core.api import DataFrame, Series
from pandas.core.common import isnull
from pandas.core.base import PandasObject
from pandas.core.dtypes import DatetimeTZDtype
from pandas.tseries.tools import to_datetime
from pandas.util.decorators import Appender

Expand Down Expand Up @@ -89,6 +90,10 @@ def _handle_date_column(col, format=None):
# parse dates as timestamp
format = 's' if format is None else format
return to_datetime(col, errors='coerce', unit=format, utc=True)
elif com.is_datetime64tz_dtype(col):
# coerce to UTC timezone
# GH11216
return to_datetime(col,errors='coerce').astype('datetime64[ns, UTC]')
else:
return to_datetime(col, errors='coerce', format=format, utc=True)

Expand All @@ -113,6 +118,14 @@ def _parse_date_columns(data_frame, parse_dates):
fmt = None
data_frame[col_name] = _handle_date_column(df_col, format=fmt)


# we want to coerce datetime64_tz dtypes for now
# we could in theory do a 'nice' conversion from a FixedOffset tz
# GH11216
for col_name, df_col in data_frame.iteritems():
if com.is_datetime64tz_dtype(df_col):
data_frame[col_name] = _handle_date_column(df_col)

return data_frame


Expand Down Expand Up @@ -366,7 +379,7 @@ def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None,
----------
sql : string SQL query or SQLAlchemy Selectable (select or text object)
to be executed.
con : SQLAlchemy connectable(engine/connection) or database string URI
con : SQLAlchemy connectable(engine/connection) or database string URI
or sqlite3 DBAPI2 connection
Using SQLAlchemy makes it possible to use any DB supported by that
library.
Expand Down Expand Up @@ -898,11 +911,10 @@ def _harmonize_columns(self, parse_dates=None):
try:
df_col = self.frame[col_name]
# the type the dataframe column should have
col_type = self._numpy_type(sql_col.type)
col_type = self._get_dtype(sql_col.type)

if col_type is datetime or col_type is date:
if not issubclass(df_col.dtype.type, np.datetime64):
self.frame[col_name] = _handle_date_column(df_col)
if col_type is datetime or col_type is date or col_type is DatetimeTZDtype:
self.frame[col_name] = _handle_date_column(df_col)

elif col_type is float:
# floats support NA, can always convert!
Expand Down Expand Up @@ -982,20 +994,25 @@ def _sqlalchemy_type(self, col):

return Text

def _numpy_type(self, sqltype):
from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date
def _get_dtype(self, sqltype):
from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP

if isinstance(sqltype, Float):
return float
if isinstance(sqltype, Integer):
elif isinstance(sqltype, Integer):
# TODO: Refine integer size.
return np.dtype('int64')
if isinstance(sqltype, DateTime):
elif isinstance(sqltype, TIMESTAMP):
# we have a timezone capable type
if not sqltype.timezone:
return datetime
return DatetimeTZDtype
elif isinstance(sqltype, DateTime):
# Caution: np.datetime64 is also a subclass of np.number.
return datetime
if isinstance(sqltype, Date):
elif isinstance(sqltype, Date):
return date
if isinstance(sqltype, Boolean):
elif isinstance(sqltype, Boolean):
return bool
return object

Expand Down
79 changes: 62 additions & 17 deletions pandas/io/tests/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,15 @@
import nose
import warnings
import numpy as np
import pandas as pd

from datetime import datetime, date, time

from pandas import DataFrame, Series, Index, MultiIndex, isnull, concat
from pandas import date_range, to_datetime, to_timedelta, Timestamp
import pandas.compat as compat
from pandas.compat import StringIO, range, lrange, string_types
from pandas.core import common as com
from pandas.core.datetools import format as date_format

import pandas.io.sql as sql
Expand Down Expand Up @@ -1248,6 +1250,66 @@ def test_default_date_load(self):
self.assertTrue(issubclass(df.DateCol.dtype.type, np.datetime64),
"DateCol loaded with incorrect type")

def test_datetime_with_timezone(self):
# Edge case: postgresql "datetime with time zone" columns can come back as
# datetime64[ns, psycopg2.tz.FixedOffsetTimezone..], which is ok but not
# very natural. This test accepts either a naive datetime64[ns] column
# holding the UTC wall time or a tz-aware datetime64[ns, UTC] column.

def check(col):
# check that a column is either datetime64[ns]
# or datetime64[ns, UTC], and that in both cases the two stored
# values match the expected UTC instants
if com.is_datetime64_dtype(col.dtype):

# "2000-01-01 00:00:00-08:00" should convert to "2000-01-01 08:00:00"
self.assertEqual(col[0], Timestamp('2000-01-01 08:00:00'))

# "2000-06-01 00:00:00-07:00" should convert to "2000-06-01 07:00:00"
self.assertEqual(col[1], Timestamp('2000-06-01 07:00:00'))

elif com.is_datetime64tz_dtype(col.dtype):
# a tz-aware column must carry exactly the UTC timezone
self.assertTrue(str(col.dt.tz) == 'UTC')

# "2000-01-01 00:00:00-08:00" should convert to "2000-01-01 08:00:00" UTC
self.assertEqual(col[0], Timestamp('2000-01-01 08:00:00', tz='UTC'))

# "2000-06-01 00:00:00-07:00" should convert to "2000-06-01 07:00:00" UTC
self.assertEqual(col[1], Timestamp('2000-06-01 07:00:00', tz='UTC'))

else:
raise AssertionError("DateCol loaded with incorrect type -> {0}".format(col.dtype))

# GH11216
# 1) plain read_sql_query, no date parsing requested
df = pd.read_sql_query("select * from types_test_data", self.conn)
if not hasattr(df,'DateColWithTz'):
raise nose.SkipTest("no column with datetime with time zone")

# this is parsed on Travis (linux), but not on Mac OS X for some reason
# even with the same versions of psycopg2 & sqlalchemy, possibly a PostgreSQL
# server version difference; hence object dtype is tolerated here as well
col = df.DateColWithTz
self.assertTrue(com.is_object_dtype(col.dtype) or com.is_datetime64_dtype(col.dtype) \
or com.is_datetime64tz_dtype(col.dtype),
"DateCol loaded with incorrect type -> {0}".format(col.dtype))

# 2) read_sql_query with the column explicitly listed in parse_dates
df = pd.read_sql_query("select * from types_test_data", self.conn, parse_dates=['DateColWithTz'])
if not hasattr(df,'DateColWithTz'):
raise nose.SkipTest("no column with datetime with time zone")
check(df.DateColWithTz)

# 3) chunked read (chunksize=1): the concatenated chunks must give a
# tz-aware UTC column equal to the read_sql_table result cast to UTC
df = pd.concat(list(pd.read_sql_query("select * from types_test_data",
self.conn,chunksize=1)),ignore_index=True)
col = df.DateColWithTz
self.assertTrue(com.is_datetime64tz_dtype(col.dtype),
"DateCol loaded with incorrect type -> {0}".format(col.dtype))
self.assertTrue(str(col.dt.tz) == 'UTC')
expected = sql.read_sql_table("types_test_data", self.conn)
tm.assert_series_equal(df.DateColWithTz, expected.DateColWithTz.astype('datetime64[ns, UTC]'))

# xref #7139
# 4) read_sql_table: this might or might not be converted depending on the
# postgres driver, so check() accepts both the naive and the UTC-aware form
df = sql.read_sql_table("types_test_data", self.conn)
check(df.DateColWithTz)

def test_date_parsing(self):
# No Parsing
df = sql.read_sql_table("types_test_data", self.conn)
Expand Down Expand Up @@ -1746,23 +1808,6 @@ def test_schema_support(self):
res2 = pdsql.read_table('test_schema_other2')
tm.assert_frame_equal(res1, res2)

def test_datetime_with_time_zone(self):

# Test to see if we read the date column with timezones that
# the timezone information is converted to utc and into a
# np.datetime64 (GH #7139)

df = sql.read_sql_table("types_test_data", self.conn)
self.assertTrue(issubclass(df.DateColWithTz.dtype.type, np.datetime64),
"DateColWithTz loaded with incorrect type -> {0}".format(df.DateColWithTz.dtype))

# "2000-01-01 00:00:00-08:00" should convert to "2000-01-01 08:00:00"
self.assertEqual(df.DateColWithTz[0], Timestamp('2000-01-01 08:00:00'))

# "2000-06-01 00:00:00-07:00" should convert to "2000-06-01 07:00:00"
self.assertEqual(df.DateColWithTz[1], Timestamp('2000-06-01 07:00:00'))


class TestMySQLAlchemy(_TestMySQLAlchemy, _TestSQLAlchemy):
pass

Expand Down