DOC: add Comparison with Excel (#38554)

afeld · web-flow · commit 26a679af139b · 2020-12-28T11:52:13.000-05:00
diff --git a/doc/source/_static/excel_pivot.png b/doc/source/_static/excel_pivot.png
diff --git a/doc/source/_static/logo_excel.svg b/doc/source/_static/logo_excel.svg
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Generator: Adobe Illustrator 23.0.1, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" id="Livello_1" xmlns:x="http://ns.adobe.com/Extensibility/1.0/" xmlns:i="http://ns.adobe.com/AdobeIllustrator/10.0/" xmlns:graph="http://ns.adobe.com/Graphs/1.0/" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 2289.75 2130" enable-background="new 0 0 2289.75 2130" xml:space="preserve">
+<metadata>
+	<sfw xmlns="http://ns.adobe.com/SaveForWeb/1.0/">
+		<slices/>
+		<sliceSourceBounds bottomLeftOrigin="true" height="2130" width="2289.75" x="-1147.5" y="-1041"/>
+	</sfw>
+</metadata>
+<path fill="#185C37" d="M1437.75,1011.75L532.5,852v1180.393c0,53.907,43.7,97.607,97.607,97.607l0,0h1562.036  c53.907,0,97.607-43.7,97.607-97.607l0,0V1597.5L1437.75,1011.75z"/>
+<path fill="#21A366" d="M1437.75,0H630.107C576.2,0,532.5,43.7,532.5,97.607c0,0,0,0,0,0V532.5l905.25,532.5L1917,1224.75  L2289.75,1065V532.5L1437.75,0z"/>
+<path fill="#107C41" d="M532.5,532.5h905.25V1065H532.5V532.5z"/>
+<path opacity="0.1" enable-background="new    " d="M1180.393,426H532.5v1331.25h647.893c53.834-0.175,97.432-43.773,97.607-97.607  V523.607C1277.825,469.773,1234.227,426.175,1180.393,426z"/>
+<path opacity="0.2" enable-background="new    " d="M1127.143,479.25H532.5V1810.5h594.643  c53.834-0.175,97.432-43.773,97.607-97.607V576.857C1224.575,523.023,1180.977,479.425,1127.143,479.25z"/>
+<path opacity="0.2" enable-background="new    " d="M1127.143,479.25H532.5V1704h594.643c53.834-0.175,97.432-43.773,97.607-97.607  V576.857C1224.575,523.023,1180.977,479.425,1127.143,479.25z"/>
+<path opacity="0.2" enable-background="new    " d="M1073.893,479.25H532.5V1704h541.393c53.834-0.175,97.432-43.773,97.607-97.607  V576.857C1171.325,523.023,1127.727,479.425,1073.893,479.25z"/>
+<linearGradient id="SVGID_1_" gradientUnits="userSpaceOnUse" x1="203.5132" y1="1729.0183" x2="967.9868" y2="404.9817" gradientTransform="matrix(1 0 0 -1 0 2132)">
+	<stop offset="0" style="stop-color:#18884F"/>
+	<stop offset="0.5" style="stop-color:#117E43"/>
+	<stop offset="1" style="stop-color:#0B6631"/>
+</linearGradient>
+<path fill="url(#SVGID_1_)" d="M97.607,479.25h976.285c53.907,0,97.607,43.7,97.607,97.607v976.285  c0,53.907-43.7,97.607-97.607,97.607H97.607C43.7,1650.75,0,1607.05,0,1553.143V576.857C0,522.95,43.7,479.25,97.607,479.25z"/>
+<path fill="#FFFFFF" d="M302.3,1382.264l205.332-318.169L319.5,747.683h151.336l102.666,202.35  c9.479,19.223,15.975,33.494,19.49,42.919h1.331c6.745-15.336,13.845-30.228,21.3-44.677L725.371,747.79h138.929l-192.925,314.548  L869.2,1382.263H721.378L602.79,1160.158c-5.586-9.45-10.326-19.376-14.164-29.66h-1.757c-3.474,10.075-8.083,19.722-13.739,28.755  l-122.102,223.011H302.3z"/>
+<path fill="#33C481" d="M2192.143,0H1437.75v532.5h852V97.607C2289.75,43.7,2246.05,0,2192.143,0L2192.143,0z"/>
+<path fill="#107C41" d="M1437.75,1065h852v532.5h-852V1065z"/>
+</svg>
diff --git a/doc/source/getting_started/comparison/comparison_boilerplate.rst b/doc/source/getting_started/comparison/comparison_boilerplate.rst
@@ -0,0 +1,9 @@
+If you're new to pandas, you might want to first read through :ref:`10 Minutes to pandas<10min>`
+to familiarize yourself with the library.
+
+As is customary, we import pandas and NumPy as follows:
+
+.. ipython:: python
+
+    import pandas as pd
+    import numpy as np
diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst
@@ -8,16 +8,7 @@ For potential users coming from `SAS <https://en.wikipedia.org/wiki/SAS_(softwar
 this page is meant to demonstrate how different SAS operations would be
 performed in pandas.
 
-If you're new to pandas, you might want to first read through :ref:`10 Minutes to pandas<10min>`
-to familiarize yourself with the library.
-
-As is customary, we import pandas and NumPy as follows:
-
-.. ipython:: python
-
-    import pandas as pd
-    import numpy as np
-
+.. include:: comparison_boilerplate.rst
 
 .. note::
 
@@ -48,14 +39,17 @@ General terminology translation
     ``NaN``, ``.``
 
 
-``DataFrame`` / ``Series``
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+``DataFrame``
+~~~~~~~~~~~~~
 
 A ``DataFrame`` in pandas is analogous to a SAS data set - a two-dimensional
 data source with labeled columns that can be of different types. As will be
 shown in this document, almost any operation that can be applied to a data set
 using SAS's ``DATA`` step, can also be accomplished in pandas.
 
+``Series``
+~~~~~~~~~~
+
 A ``Series`` is the data structure that represents one column of a
 ``DataFrame``. SAS doesn't have a separate data structure for a single column,
 but in general, working with a ``Series`` is analogous to referencing a column
diff --git a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst
@@ -0,0 +1,253 @@
+.. _compare_with_spreadsheets:
+
+{{ header }}
+
+Comparison with spreadsheets
+****************************
+
+Since many potential pandas users have some familiarity with spreadsheet programs like
+`Excel <https://support.microsoft.com/en-us/excel>`_, this page is meant to provide some examples
+of how various spreadsheet operations would be performed using pandas. This page will use
+terminology and link to documentation for Excel, but much will be the same/similar in
+`Google Sheets <https://support.google.com/a/users/answer/9282959>`_,
+`LibreOffice Calc <https://help.libreoffice.org/latest/en-US/text/scalc/main0000.html?DbPAR=CALC>`_,
+`Apple Numbers <https://www.apple.com/mac/numbers/compatibility/functions.html>`_, and other
+Excel-compatible spreadsheet software.
+
+.. include:: comparison_boilerplate.rst
+
+Data structures
+---------------
+
+General terminology translation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. csv-table::
+    :header: "pandas", "Excel"
+    :widths: 20, 20
+
+    ``DataFrame``, worksheet
+    ``Series``, column
+    ``Index``, row headings
+    row, row
+    ``NaN``, empty cell
+
+``DataFrame``
+~~~~~~~~~~~~~
+
+A ``DataFrame`` in pandas is analogous to an Excel worksheet. While an Excel workbook can contain
+multiple worksheets, pandas ``DataFrame``\s exist independently.
+
+``Series``
+~~~~~~~~~~
+
+A ``Series`` is the data structure that represents one column of a ``DataFrame``. Working with a
+``Series`` is analogous to referencing a column of a spreadsheet.
+
+``Index``
+~~~~~~~~~
+
+Every ``DataFrame`` and ``Series`` has an ``Index``, which are labels on the *rows* of the data. In
+pandas, if no index is specified, a :class:`~pandas.RangeIndex` is used by default (first row = 0,
+second row = 1, and so on), analogous to row headings/numbers in spreadsheets.
+
+In pandas, indexes can be set to one (or multiple) unique values, which is like having a column that
+you use as the row identifier in a worksheet. Unlike spreadsheets, these ``Index`` values can actually be
+used to reference the rows. For example, in spreadsheets, you would reference the first row as ``A1:Z1``,
+while in pandas you could use ``populations.loc['Chicago']``.
+
+Index values are also persistent, so if you re-order the rows in a ``DataFrame``, the label for a
+particular row doesn't change.
+
+See the :ref:`indexing documentation<indexing>` for much more on how to use an ``Index``
+effectively.
+
+Commonly used spreadsheet functionalities
+-----------------------------------------
+
+Importing data
+~~~~~~~~~~~~~~
+
+Both `Excel <https://support.microsoft.com/en-us/office/import-data-from-external-data-sources-power-query-be4330b3-5356-486c-a168-b68e9e616f5a>`__
+and :ref:`pandas <10min_tut_02_read_write>` can import data from various sources in various
+formats.
+
+Excel files
+'''''''''''
+
+Excel opens `various Excel file formats <https://support.microsoft.com/en-us/office/file-formats-that-are-supported-in-excel-0943ff2c-6014-4e8d-aaea-b83d51d46247>`_
+by double-clicking them, or using `the Open menu <https://support.microsoft.com/en-us/office/open-files-from-the-file-menu-97f087d8-3136-4485-8e86-c5b12a8c4176>`_.
+In pandas, you use :ref:`special methods for reading and writing from/to Excel files <io.excel>`.
+
+CSV
+'''
+
+Let's load and display the `tips <https://github.com/pandas-dev/pandas/blob/master/pandas/tests/io/data/csv/tips.csv>`_
+dataset from the pandas tests, which is a CSV file. In Excel, you would download and then
+`open the CSV <https://support.microsoft.com/en-us/office/import-or-export-text-txt-or-csv-files-5250ac4c-663c-47ce-937b-339e391393ba>`_.
+In pandas, you pass the URL or local path of the CSV file to :func:`~pandas.read_csv`:
+
+.. ipython:: python
+
+   url = (
+       "https://raw.github.com/pandas-dev"
+       "/pandas/master/pandas/tests/io/data/csv/tips.csv"
+   )
+   tips = pd.read_csv(url)
+   tips
+
+Fill Handle
+~~~~~~~~~~~
+
+Create a series of numbers following a set pattern in a certain set of cells. In
+a spreadsheet, this would be done by shift+drag after entering the first number or by
+entering the first two or three values and then dragging.
+
+This can be achieved by creating a series and assigning it to the desired cells.
+
+.. ipython:: python
+
+    df = pd.DataFrame({"AAA": [1] * 8, "BBB": list(range(0, 8))})
+    df
+
+    series = list(range(1, 5))
+    series
+
+    df.loc[2:5, "AAA"] = series
+
+    df
+
+Filters
+~~~~~~~
+
+Filters can be achieved by using slicing.
+
+The examples filter by 0 on column AAA, and also show how to filter by multiple
+values.
+
+.. ipython:: python
+
+   df[df.AAA == 0]
+
+   df[(df.AAA == 0) | (df.AAA == 2)]
+
+
+Drop Duplicates
+~~~~~~~~~~~~~~~
+
+Excel has built-in functionality for `removing duplicate values <https://support.microsoft.com/en-us/office/find-and-remove-duplicates-00e35bea-b46a-4d5d-b28e-66a552dc138d>`_.
+This is supported in pandas via :meth:`~DataFrame.drop_duplicates`.
+
+.. ipython:: python
+
+    df = pd.DataFrame(
+        {
+            "class": ["A", "A", "A", "B", "C", "D"],
+            "student_count": [42, 35, 42, 50, 47, 45],
+            "all_pass": ["Yes", "Yes", "Yes", "No", "No", "Yes"],
+        }
+    )
+
+    df.drop_duplicates()
+
+    df.drop_duplicates(["class", "student_count"])
+
+
+Pivot Tables
+~~~~~~~~~~~~
+
+`PivotTables <https://support.microsoft.com/en-us/office/create-a-pivottable-to-analyze-worksheet-data-a9a84538-bfe9-40a9-a8e9-f99134456576>`_
+from spreadsheets can be replicated in pandas through :ref:`reshaping`. Using the ``tips`` dataset again,
+let's find the average gratuity by size of the party and sex of the server.
+
+In Excel, we use the following configuration for the PivotTable:
+
+.. image:: ../../_static/excel_pivot.png
+   :align: center
+
+The equivalent in pandas:
+
+.. ipython:: python
+
+    pd.pivot_table(
+        tips, values="tip", index=["size"], columns=["sex"], aggfunc=np.average
+    )
+
+Formulas
+~~~~~~~~
+
+In spreadsheets, `formulas <https://support.microsoft.com/en-us/office/overview-of-formulas-in-excel-ecfdc708-9162-49e8-b993-c311f47ca173>`_
+are often created in individual cells and then `dragged <https://support.microsoft.com/en-us/office/copy-a-formula-by-dragging-the-fill-handle-in-excel-for-mac-dd928259-622b-473f-9a33-83aa1a63e218>`_
+into other cells to compute them for other columns. In pandas, you'll be doing more operations on
+full columns.
+
+As an example, let's create a new column "girls_count" and try to compute the number of boys in
+each class.
+
+.. ipython:: python
+
+    df["girls_count"] = [21, 12, 21, 31, 23, 17]
+    df
+    df["boys_count"] = df["student_count"] - df["girls_count"]
+    df
+
+Note that we aren't having to tell it to do that subtraction cell-by-cell — pandas handles that for
+us. See :ref:`how to create new columns derived from existing columns <10min_tut_05_columns>`.
+
+VLOOKUP
+~~~~~~~
+
+.. ipython:: python
+
+    import random
+
+    first_names = [
+        "harry",
+        "ron",
+        "hermione",
+        "rubius",
+        "albus",
+        "severus",
+        "luna",
+    ]
+    keys = [1, 2, 3, 4, 5, 6, 7]
+    df1 = pd.DataFrame({"keys": keys, "first_names": first_names})
+    df1
+
+    surnames = [
+        "hadrid",
+        "malfoy",
+        "lovegood",
+        "dumbledore",
+        "grindelwald",
+        "granger",
+        "weasly",
+        "riddle",
+        "longbottom",
+        "snape",
+    ]
+    keys = [random.randint(1, 7) for x in range(0, 10)]
+    random_names = pd.DataFrame({"surnames": surnames, "keys": keys})
+
+    random_names
+
+    random_names.merge(df1, on="keys", how="left")
+
+Adding a row
+~~~~~~~~~~~~
+
+To append a row, we can just assign values to an index using :meth:`~DataFrame.loc`.
+
+NOTE: If the index already exists, the values in that index will be overwritten.
+
+.. ipython:: python
+
+    df1.loc[7] = [8, "tonks"]
+    df1
+
+
+Search and Replace
+~~~~~~~~~~~~~~~~~~
+
+The ``replace`` method that comes associated with the ``DataFrame`` object can perform
+this function. Please see `pandas.DataFrame.replace <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.replace.html>`__ for examples.
diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst
@@ -8,15 +8,7 @@ Since many potential pandas users have some familiarity with
 `SQL <https://en.wikipedia.org/wiki/SQL>`_, this page is meant to provide some examples of how
 various SQL operations would be performed using pandas.
 
-If you're new to pandas, you might want to first read through :ref:`10 Minutes to pandas<10min>`
-to familiarize yourself with the library.
-
-As is customary, we import pandas and NumPy as follows:
-
-.. ipython:: python
-
-    import pandas as pd
-    import numpy as np
+.. include:: comparison_boilerplate.rst
 
 Most of the examples will utilize the ``tips`` dataset found within pandas tests.  We'll read
 the data into a DataFrame called ``tips`` and assume we have a database table of the same name and
diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst
@@ -8,17 +8,7 @@ For potential users coming from `Stata <https://en.wikipedia.org/wiki/Stata>`__
 this page is meant to demonstrate how different Stata operations would be
 performed in pandas.
 
-If you're new to pandas, you might want to first read through :ref:`10 Minutes to pandas<10min>`
-to familiarize yourself with the library.
-
-As is customary, we import pandas and NumPy as follows. This means that we can refer to the
-libraries as ``pd`` and ``np``, respectively, for the rest of the document.
-
-.. ipython:: python
-
-    import pandas as pd
-    import numpy as np
-
+.. include:: comparison_boilerplate.rst
 
 .. note::
 
@@ -48,14 +38,17 @@ General terminology translation
     ``NaN``, ``.``
 
 
-``DataFrame`` / ``Series``
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+``DataFrame``
+~~~~~~~~~~~~~
 
 A ``DataFrame`` in pandas is analogous to a Stata data set -- a two-dimensional
 data source with labeled columns that can be of different types. As will be
 shown in this document, almost any operation that can be applied to a data set
 in Stata can also be accomplished in pandas.
 
+``Series``
+~~~~~~~~~~
+
 A ``Series`` is the data structure that represents one column of a
 ``DataFrame``. Stata doesn't have a separate data structure for a single column,
 but in general, working with a ``Series`` is analogous to referencing a column
diff --git a/doc/source/getting_started/comparison/index.rst b/doc/source/getting_started/comparison/index.rst
@@ -11,5 +11,6 @@ Comparison with other tools
 
     comparison_with_r
     comparison_with_sql
+    comparison_with_spreadsheets
     comparison_with_sas
     comparison_with_stata
diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst
diff --git a/doc/source/getting_started/intro_tutorials/05_add_columns.rst b/doc/source/getting_started/intro_tutorials/05_add_columns.rst