
Commit 1381c39

Author: y-p (committed)
Merge pull request #3504 from pjob/s3-support
ENH: Support reading from S3
2 parents bf667e3 + f06b43c

4 files changed (+38 / -14 lines)


README.rst

Lines changed: 1 addition & 0 deletions
@@ -90,6 +90,7 @@ Optional dependencies
 * openpyxl version 1.6.1 or higher, for writing .xlsx files
 * xlrd >= 0.9.0
    * Needed for Excel I/O
+* `boto <https://pypi.python.org/pypi/boto>`__: necessary for Amazon S3 access.
 
 
 Installation from sources

RELEASE.rst

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ pandas 0.11.1
 
 - pd.read_html() can now parse HTML string, files or urls and return dataframes
   courtesy of @cpcloud. (GH3477_)
+- Support for reading Amazon S3 files. (GH3504_)
 
 **Improvements to existing features**
 

doc/source/io.rst

Lines changed: 3 additions & 2 deletions
@@ -40,8 +40,9 @@ for some advanced strategies
 
 They can take a number of arguments:
 
-  - ``filepath_or_buffer``: Either a string path to a file, or any object with a
-    ``read`` method (such as an open file or ``StringIO``).
+  - ``filepath_or_buffer``: Either a string path to a file, url
+    (including http, ftp, and s3 locations), or any object with a ``read``
+    method (such as an open file or ``StringIO``).
   - ``sep`` or ``delimiter``: A delimiter / separator to split fields
     on. `read_csv` is capable of inferring the delimiter automatically in some
     cases by "sniffing." The separator may be specified as a regular
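
To make the accepted forms concrete, a short usage sketch (paths and URLs below are hypothetical):

    import pandas as pd
    from StringIO import StringIO

    df = pd.read_csv('data/table.csv')                  # a plain string path
    df = pd.read_csv('http://example.com/table.csv')    # a url (http, ftp, or s3)
    df = pd.read_csv(StringIO('a,b\n1,2\n3,4'))         # any object with a read method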

pandas/io/parsers.py

Lines changed: 33 additions & 12 deletions
@@ -34,7 +34,7 @@ class DateConversionError(Exception):
 Parameters
 ----------
 filepath_or_buffer : string or file handle / StringIO. The string could be
-    a URL. Valid URL schemes include http, ftp, and file. For file URLs, a host
+    a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host
     is expected. For instance, a local file could be
     file ://localhost/path/to/table.csv
 %s
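
Per the updated docstring, file URLs carry an explicit host; a minimal sketch (path hypothetical):

    import pandas as pd

    # file URL with a localhost host, matching the docstring's example form
    df = pd.read_csv('file://localhost/tmp/table.csv')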
@@ -188,6 +188,12 @@ def _is_url(url):
     except:
         return False
 
+def _is_s3_url(url):
+    """ Check for an s3 url """
+    try:
+        return urlparse.urlparse(url).scheme == 's3'
+    except:
+        return False
 
 def _read(filepath_or_buffer, kwds):
     "Generic reader of line files."
@@ -196,17 +202,32 @@ def _read(filepath_or_buffer, kwds):
     if skipfooter is not None:
         kwds['skip_footer'] = skipfooter
 
-    if isinstance(filepath_or_buffer, basestring) and _is_url(filepath_or_buffer):
-        from urllib2 import urlopen
-        filepath_or_buffer = urlopen(filepath_or_buffer)
-        if py3compat.PY3:  # pragma: no cover
-            if encoding:
-                errors = 'strict'
-            else:
-                errors = 'replace'
-                encoding = 'utf-8'
-            bytes = filepath_or_buffer.read()
-            filepath_or_buffer = StringIO(bytes.decode(encoding, errors))
+    if isinstance(filepath_or_buffer, basestring):
+        if _is_url(filepath_or_buffer):
+            from urllib2 import urlopen
+            filepath_or_buffer = urlopen(filepath_or_buffer)
+            if py3compat.PY3:  # pragma: no cover
+                if encoding:
+                    errors = 'strict'
+                else:
+                    errors = 'replace'
+                    encoding = 'utf-8'
+                bytes = filepath_or_buffer.read()
+                filepath_or_buffer = StringIO(bytes.decode(encoding, errors))
+
+        if _is_s3_url(filepath_or_buffer):
+            try:
+                import boto
+            except:
+                raise ImportError("boto is required to handle s3 files")
+            # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
+            # are environment variables
+            parsed_url = urlparse.urlparse(filepath_or_buffer)
+            conn = boto.connect_s3()
+            b = conn.get_bucket(parsed_url.netloc)
+            k = boto.s3.key.Key(b)
+            k.key = parsed_url.path
+            filepath_or_buffer = StringIO(k.get_contents_as_string())
 
     if kwds.get('date_parser', None) is not None:
         if isinstance(kwds['parse_dates'], bool):
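
End to end, with boto installed and AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY exported, reading an S3-hosted CSV should reduce to a one-liner (bucket and key are placeholders, not from the commit):

    import pandas as pd

    # boto picks the credentials up from the environment, as the comment
    # in the patch notes
    df = pd.read_csv('s3://my-bucket/path/to/table.csv')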
