{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "731039f0-a993-430b-996a-cad1b809ac02",
   "metadata": {},
   "source": [
    "# Split-Apply-Combine"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f34f881f-7f9b-4e51-9ee7-8c50b877390c",
   "metadata": {},
   "source": [
    "Assume the following dataframe. It contains measurements of objects from 2 different files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6ad14083-e065-44e9-a5ef-69db286bfc67",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8e6ddf36-5f87-415e-a983-364168dee4c3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>intensity_mean</th>\n",
       "      <th>major_axis_length</th>\n",
       "      <th>minor_axis_length</th>\n",
       "      <th>aspect_ratio</th>\n",
       "      <th>file_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>139</td>\n",
       "      <td>96.546763</td>\n",
       "      <td>17.504104</td>\n",
       "      <td>10.292770</td>\n",
       "      <td>1.700621</td>\n",
       "      <td>20P1_POS0010_D_1UL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>360</td>\n",
       "      <td>86.613889</td>\n",
       "      <td>35.746808</td>\n",
       "      <td>14.983124</td>\n",
       "      <td>2.385805</td>\n",
       "      <td>20P1_POS0010_D_1UL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>43</td>\n",
       "      <td>91.488372</td>\n",
       "      <td>12.967884</td>\n",
       "      <td>4.351573</td>\n",
       "      <td>2.980045</td>\n",
       "      <td>20P1_POS0010_D_1UL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>140</td>\n",
       "      <td>73.742857</td>\n",
       "      <td>18.940508</td>\n",
       "      <td>10.314404</td>\n",
       "      <td>1.836316</td>\n",
       "      <td>20P1_POS0010_D_1UL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>144</td>\n",
       "      <td>89.375000</td>\n",
       "      <td>13.639308</td>\n",
       "      <td>13.458532</td>\n",
       "      <td>1.013432</td>\n",
       "      <td>20P1_POS0010_D_1UL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>305</td>\n",
       "      <td>88.252459</td>\n",
       "      <td>20.226532</td>\n",
       "      <td>19.244210</td>\n",
       "      <td>1.051045</td>\n",
       "      <td>20P1_POS0007_D_1UL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107</th>\n",
       "      <td>593</td>\n",
       "      <td>89.905565</td>\n",
       "      <td>36.508370</td>\n",
       "      <td>21.365394</td>\n",
       "      <td>1.708762</td>\n",
       "      <td>20P1_POS0007_D_1UL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>289</td>\n",
       "      <td>106.851211</td>\n",
       "      <td>20.427809</td>\n",
       "      <td>18.221452</td>\n",
       "      <td>1.121086</td>\n",
       "      <td>20P1_POS0007_D_1UL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>277</td>\n",
       "      <td>100.664260</td>\n",
       "      <td>20.307965</td>\n",
       "      <td>17.432920</td>\n",
       "      <td>1.164920</td>\n",
       "      <td>20P1_POS0007_D_1UL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>46</td>\n",
       "      <td>70.869565</td>\n",
       "      <td>11.648895</td>\n",
       "      <td>5.298003</td>\n",
       "      <td>2.198733</td>\n",
       "      <td>20P1_POS0007_D_1UL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>111 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     area  intensity_mean  major_axis_length  minor_axis_length  aspect_ratio  \\\n",
       "0     139       96.546763          17.504104          10.292770      1.700621   \n",
       "1     360       86.613889          35.746808          14.983124      2.385805   \n",
       "2      43       91.488372          12.967884           4.351573      2.980045   \n",
       "3     140       73.742857          18.940508          10.314404      1.836316   \n",
       "4     144       89.375000          13.639308          13.458532      1.013432   \n",
       "..    ...             ...                ...                ...           ...   \n",
       "106   305       88.252459          20.226532          19.244210      1.051045   \n",
       "107   593       89.905565          36.508370          21.365394      1.708762   \n",
       "108   289      106.851211          20.427809          18.221452      1.121086   \n",
       "109   277      100.664260          20.307965          17.432920      1.164920   \n",
       "110    46       70.869565          11.648895           5.298003      2.198733   \n",
       "\n",
       "              file_name  \n",
       "0    20P1_POS0010_D_1UL  \n",
       "1    20P1_POS0010_D_1UL  \n",
       "2    20P1_POS0010_D_1UL  \n",
       "3    20P1_POS0010_D_1UL  \n",
       "4    20P1_POS0010_D_1UL  \n",
       "..                  ...  \n",
       "106  20P1_POS0007_D_1UL  \n",
       "107  20P1_POS0007_D_1UL  \n",
       "108  20P1_POS0007_D_1UL  \n",
       "109  20P1_POS0007_D_1UL  \n",
       "110  20P1_POS0007_D_1UL  \n",
       "\n",
       "[111 rows x 6 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the table of object measurements from the CSV file\n",
    "df = pd.read_csv('../../data/BBBC007_analysis.csv')\n",
    "\n",
    "# Take a look\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d50bf409-5dd7-41b4-85a6-ae8d3189abb7",
   "metadata": {},
   "source": [
    "Let’s say we want to compute the median \"intensity_mean\" of round objects and also distinguish these objects by file. Ignoring for a second the mechanics of how we would do this with Python, let’s think about it in English. What do we need to do?\n",
    "\n",
    "- Split the data set up according to a 'round' criterion field, i.e., split it up so we have a separate data set for the two classes, those round and those not round.\n",
    "\n",
    "- Apply a median function to the intensity in these split data sets.\n",
    "\n",
    "- Combine the results of these medians on the split data set into a new, summary data set that contains the two classes (round and not round) and medians for each.\n",
    "\n",
    "We see that the strategy we want is a split-apply-combine strategy. This idea was put forward by Hadley Wickham in [this paper](https://www.jstatsoft.org/article/view/v040i01). It turns out that this is a strategy we want to use very often. Split the data in terms of some criterion. Apply some function to the split-up data. Combine the results into a new data frame.\n",
    "\n",
    "Note that if the data are tidy, this procedure makes a lot of sense. Choose the column you want to use to split by. All rows with like entries in the splitting column are then grouped into a new data set. You can then apply any function you want to these new data sets. You can then combine the results into a new data frame.\n",
    "\n",
    "Pandas’s split-apply-combine operations are achieved using the `groupby()` method. You can think of `groupby()` as the splitting part. You can then apply functions to the resulting `DataFrameGroupBy` object. The [Pandas documentation on split-apply-combine](https://pandas.pydata.org/docs/user_guide/groupby.html) is excellent and worth reading through. It is extensive though, so don’t let yourself get intimidated by it."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab72dfca-5ea2-4a8c-9b26-4e6afdef4c27",
   "metadata": {},
   "source": [
    "Before all that, we create a new column with our criterion for roundness."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e6cd8e95-a0f7-47e5-adb7-2a57fd765f5f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>intensity_mean</th>\n",
       "      <th>major_axis_length</th>\n",
       "      <th>minor_axis_length</th>\n",
       "      <th>aspect_ratio</th>\n",
       "      <th>file_name</th>\n",
       "      <th>round</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>139</td>\n",
       "      <td>96.546763</td>\n",
       "      <td>17.504104</td>\n",
       "      <td>10.292770</td>\n",
       "      <td>1.700621</td>\n",
       "      <td>20P1_POS0010_D_1UL</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>360</td>\n",
       "      <td>86.613889</td>\n",
       "      <td>35.746808</td>\n",
       "      <td>14.983124</td>\n",
       "      <td>2.385805</td>\n",
       "      <td>20P1_POS0010_D_1UL</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>43</td>\n",
       "      <td>91.488372</td>\n",
       "      <td>12.967884</td>\n",
       "      <td>4.351573</td>\n",
       "      <td>2.980045</td>\n",
       "      <td>20P1_POS0010_D_1UL</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>140</td>\n",
       "      <td>73.742857</td>\n",
       "      <td>18.940508</td>\n",
       "      <td>10.314404</td>\n",
       "      <td>1.836316</td>\n",
       "      <td>20P1_POS0010_D_1UL</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>144</td>\n",
       "      <td>89.375000</td>\n",
       "      <td>13.639308</td>\n",
       "      <td>13.458532</td>\n",
       "      <td>1.013432</td>\n",
       "      <td>20P1_POS0010_D_1UL</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>305</td>\n",
       "      <td>88.252459</td>\n",
       "      <td>20.226532</td>\n",
       "      <td>19.244210</td>\n",
       "      <td>1.051045</td>\n",
       "      <td>20P1_POS0007_D_1UL</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107</th>\n",
       "      <td>593</td>\n",
       "      <td>89.905565</td>\n",
       "      <td>36.508370</td>\n",
       "      <td>21.365394</td>\n",
       "      <td>1.708762</td>\n",
       "      <td>20P1_POS0007_D_1UL</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>289</td>\n",
       "      <td>106.851211</td>\n",
       "      <td>20.427809</td>\n",
       "      <td>18.221452</td>\n",
       "      <td>1.121086</td>\n",
       "      <td>20P1_POS0007_D_1UL</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>277</td>\n",
       "      <td>100.664260</td>\n",
       "      <td>20.307965</td>\n",
       "      <td>17.432920</td>\n",
       "      <td>1.164920</td>\n",
       "      <td>20P1_POS0007_D_1UL</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>46</td>\n",
       "      <td>70.869565</td>\n",
       "      <td>11.648895</td>\n",
       "      <td>5.298003</td>\n",
       "      <td>2.198733</td>\n",
       "      <td>20P1_POS0007_D_1UL</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>111 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     area  intensity_mean  major_axis_length  minor_axis_length  aspect_ratio  \\\n",
       "0     139       96.546763          17.504104          10.292770      1.700621   \n",
       "1     360       86.613889          35.746808          14.983124      2.385805   \n",
       "2      43       91.488372          12.967884           4.351573      2.980045   \n",
       "3     140       73.742857          18.940508          10.314404      1.836316   \n",
       "4     144       89.375000          13.639308          13.458532      1.013432   \n",
       "..    ...             ...                ...                ...           ...   \n",
       "106   305       88.252459          20.226532          19.244210      1.051045   \n",
       "107   593       89.905565          36.508370          21.365394      1.708762   \n",
       "108   289      106.851211          20.427809          18.221452      1.121086   \n",
       "109   277      100.664260          20.307965          17.432920      1.164920   \n",
       "110    46       70.869565          11.648895           5.298003      2.198733   \n",
       "\n",
       "              file_name  round  \n",
       "0    20P1_POS0010_D_1UL  False  \n",
       "1    20P1_POS0010_D_1UL  False  \n",
       "2    20P1_POS0010_D_1UL  False  \n",
       "3    20P1_POS0010_D_1UL  False  \n",
       "4    20P1_POS0010_D_1UL   True  \n",
       "..                  ...    ...  \n",
       "106  20P1_POS0007_D_1UL   True  \n",
       "107  20P1_POS0007_D_1UL  False  \n",
       "108  20P1_POS0007_D_1UL   True  \n",
       "109  20P1_POS0007_D_1UL   True  \n",
       "110  20P1_POS0007_D_1UL  False  \n",
       "\n",
       "[111 rows x 7 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Boolean column: an object counts as 'round' when its aspect ratio is below 1.2\n",
    "df['round'] = df['aspect_ratio'] < 1.2\n",
    "\n",
    "# Take a look\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "464afbc4-287e-4ebc-9b29-dc80fe4f8805",
   "metadata": {},
   "source": [
    "## Aggregation: median intensity"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ef04e5ff-02a2-4b72-aab6-e596dcb63eda",
   "metadata": {},
   "source": [
    "Let's start by grouping by 'round'."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "37aaf674-fcc8-4ff1-96e2-216a0135fe10",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001788C205760>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Split step: group the rows by their value in the 'round' column\n",
    "grouped = df.groupby('round')\n",
    "\n",
    "# Take a look (only the object's repr is shown, not the data)\n",
    "grouped"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "00eac35a-50a6-4085-b94e-6c79592c5ef5",
   "metadata": {},
   "source": [
    "There is not much to see in the DataFrameGroupBy object that resulted. But there is a lot we can do with this object. Typing `grouped.` and hitting tab will show you the many possibilities. For most of these possibilities, the apply and combine steps happen together and a new data frame is returned. The grouped.median() method is exactly what we want."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "14983422-5c93-431e-a105-7eb90f739372",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>intensity_mean</th>\n",
       "      <th>major_axis_length</th>\n",
       "      <th>minor_axis_length</th>\n",
       "      <th>aspect_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>round</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>False</th>\n",
       "      <td>270.0</td>\n",
       "      <td>92.788345</td>\n",
       "      <td>21.459495</td>\n",
       "      <td>15.858324</td>\n",
       "      <td>1.412849</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>291.0</td>\n",
       "      <td>100.256000</td>\n",
       "      <td>20.155547</td>\n",
       "      <td>18.352287</td>\n",
       "      <td>1.101700</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        area  intensity_mean  major_axis_length  minor_axis_length  \\\n",
       "round                                                                \n",
       "False  270.0       92.788345          21.459495          15.858324   \n",
       "True   291.0      100.256000          20.155547          18.352287   \n",
       "\n",
       "       aspect_ratio  \n",
       "round                \n",
       "False      1.412849  \n",
       "True       1.101700  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Apply + combine: per-group median of every numeric column\n",
    "df_median = grouped.median(numeric_only=True)\n",
    "\n",
    "# Show the summary table\n",
    "df_median"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cfd9ac1c-b987-444b-96cf-aad7acc82cab",
   "metadata": {},
   "source": [
    "Here, the `numeric_only` option is set so that the categorical 'file_name' column is left out of the median calculation."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "339a3a54-7d67-427b-9a74-c62239040959",
   "metadata": {},
   "source": [
    "The outputted data frame has the medians of all quantities, including the intensities that we wanted. Note that this data frame has 'round' as the name of the row index. If we want to instead keep 'round' (which, remember, is what we used to split up the data set before we computed the summary statistics) as a column, we can use the reset_index() method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "103b54c4-24a1-487a-a169-4f4fc535ddf7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>round</th>\n",
       "      <th>area</th>\n",
       "      <th>intensity_mean</th>\n",
       "      <th>major_axis_length</th>\n",
       "      <th>minor_axis_length</th>\n",
       "      <th>aspect_ratio</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>270.0</td>\n",
       "      <td>92.788345</td>\n",
       "      <td>21.459495</td>\n",
       "      <td>15.858324</td>\n",
       "      <td>1.412849</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>True</td>\n",
       "      <td>291.0</td>\n",
       "      <td>100.256000</td>\n",
       "      <td>20.155547</td>\n",
       "      <td>18.352287</td>\n",
       "      <td>1.101700</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   round   area  intensity_mean  major_axis_length  minor_axis_length  \\\n",
       "0  False  270.0       92.788345          21.459495          15.858324   \n",
       "1   True  291.0      100.256000          20.155547          18.352287   \n",
       "\n",
       "   aspect_ratio  \n",
       "0      1.412849  \n",
       "1      1.101700  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Returns a new data frame in which 'round' is an ordinary column\n",
    "df_median.reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c01ca9fd-eb54-4ebd-ba73-e0318b2d143d",
   "metadata": {},
   "source": [
    "Note, though, that this was not done in-place. df_median still has an index labeled 'round'. If you want to update your data frame, you have to explicitly do so with an assignment operator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "199fe5d5-e7ea-4956-be6c-76fece4ae370",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebind df_median to the re-indexed result so the change persists\n",
    "df_median = df_median.reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "588cf301-c855-4905-a565-caa307ec2dfe",
   "metadata": {},
   "source": [
    "We can also use multiple columns in our groupby() operation. For example, we may wish to look at four groups: round from the first file, round from the second file, not round from the first file, and not round from the second file. To do this, we simply pass a list of columns into df.groupby(). We will chain the methods, performing a groupby, applying a median, and then resetting the index of the result, all in one line."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e5880741-5332-431d-89af-0e332f8b984d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>round</th>\n",
       "      <th>file_name</th>\n",
       "      <th>area</th>\n",
       "      <th>intensity_mean</th>\n",
       "      <th>major_axis_length</th>\n",
       "      <th>minor_axis_length</th>\n",
       "      <th>aspect_ratio</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>20P1_POS0007_D_1UL</td>\n",
       "      <td>323.0</td>\n",
       "      <td>91.796791</td>\n",
       "      <td>23.755227</td>\n",
       "      <td>17.072477</td>\n",
       "      <td>1.467410</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "      <td>20P1_POS0010_D_1UL</td>\n",
       "      <td>237.0</td>\n",
       "      <td>93.269113</td>\n",
       "      <td>20.410737</td>\n",
       "      <td>14.832035</td>\n",
       "      <td>1.353858</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>True</td>\n",
       "      <td>20P1_POS0007_D_1UL</td>\n",
       "      <td>293.0</td>\n",
       "      <td>98.227799</td>\n",
       "      <td>20.307965</td>\n",
       "      <td>18.599043</td>\n",
       "      <td>1.101700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>True</td>\n",
       "      <td>20P1_POS0010_D_1UL</td>\n",
       "      <td>277.5</td>\n",
       "      <td>103.299825</td>\n",
       "      <td>19.662330</td>\n",
       "      <td>17.680741</td>\n",
       "      <td>1.103133</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   round           file_name   area  intensity_mean  major_axis_length  \\\n",
       "0  False  20P1_POS0007_D_1UL  323.0       91.796791          23.755227   \n",
       "1  False  20P1_POS0010_D_1UL  237.0       93.269113          20.410737   \n",
       "2   True  20P1_POS0007_D_1UL  293.0       98.227799          20.307965   \n",
       "3   True  20P1_POS0010_D_1UL  277.5      103.299825          19.662330   \n",
       "\n",
       "   minor_axis_length  aspect_ratio  \n",
       "0          17.072477      1.467410  \n",
       "1          14.832035      1.353858  \n",
       "2          18.599043      1.101700  \n",
       "3          17.680741      1.103133  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'file_name' is a grouping key here, so only numeric columns are left to take the median of\n",
    "df.groupby(['round', 'file_name']).median().reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5839d877-f17a-4c5c-8a3a-a3e0d5d537c4",
   "metadata": {},
   "source": [
    "This type of operation is called an aggregation. That is, we split the data set up into groups, and then computed a summary statistic for each group, in this case the median."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b6130950-ff18-4487-ab18-9fb05588d407",
   "metadata": {},
   "source": [
    "## Aggregating with custom functions"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f4a4eeab-fd4e-43fc-b2d5-26d0c21a04e7",
   "metadata": {},
   "source": [
    "If we want to apply a function that is not built-in, we can also do so. For example, let's apply the coefficient of variation (the standard deviation divided by the mean). We can define a generic function that calculates it, as shown below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e86a620d-9b0c-4255-b66a-4c3dea178dcd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def coefficient_of_variance(data):\n",
    "    \"\"\"Compute coefficient of variation from an array of data.\"\"\"\n",
    "    return np.std(data) / np.mean(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "05d72d03-18dd-409f-8034-2fb2641b02b7",
   "metadata": {},
   "source": [
    "Now we group the data and apply our function as an aggregating function. If there are other categorical variables, they should be masked (i.e., excluded) first."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a0c01a3c-6064-4764-8772-78b52a334229",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regroup by both keys; this rebinds the name 'grouped' used earlier for the single-key grouping\n",
    "grouped = df.groupby(['round', 'file_name'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "cbc2b527-219c-4e94-a153-a7b785cdce48",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>area</th>\n",
       "      <th>intensity_mean</th>\n",
       "      <th>major_axis_length</th>\n",
       "      <th>minor_axis_length</th>\n",
       "      <th>aspect_ratio</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>round</th>\n",
       "      <th>file_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">False</th>\n",
       "      <th>20P1_POS0007_D_1UL</th>\n",
       "      <td>0.540399</td>\n",
       "      <td>0.145956</td>\n",
       "      <td>0.349857</td>\n",
       "      <td>0.289063</td>\n",
       "      <td>0.243450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20P1_POS0010_D_1UL</th>\n",
       "      <td>0.765156</td>\n",
       "      <td>0.143506</td>\n",
       "      <td>0.447638</td>\n",
       "      <td>0.402708</td>\n",
       "      <td>0.316206</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">True</th>\n",
       "      <th>20P1_POS0007_D_1UL</th>\n",
       "      <td>0.248799</td>\n",
       "      <td>0.099636</td>\n",
       "      <td>0.145247</td>\n",
       "      <td>0.147626</td>\n",
       "      <td>0.036950</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20P1_POS0010_D_1UL</th>\n",
       "      <td>0.275120</td>\n",
       "      <td>0.107008</td>\n",
       "      <td>0.167722</td>\n",
       "      <td>0.166214</td>\n",
       "      <td>0.043755</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              area  intensity_mean  major_axis_length  \\\n",
       "round file_name                                                         \n",
       "False 20P1_POS0007_D_1UL  0.540399        0.145956           0.349857   \n",
       "      20P1_POS0010_D_1UL  0.765156        0.143506           0.447638   \n",
       "True  20P1_POS0007_D_1UL  0.248799        0.099636           0.145247   \n",
       "      20P1_POS0010_D_1UL  0.275120        0.107008           0.167722   \n",
       "\n",
       "                          minor_axis_length  aspect_ratio  \n",
       "round file_name                                            \n",
       "False 20P1_POS0007_D_1UL           0.289063      0.243450  \n",
       "      20P1_POS0010_D_1UL           0.402708      0.316206  \n",
       "True  20P1_POS0007_D_1UL           0.147626      0.036950  \n",
       "      20P1_POS0010_D_1UL           0.166214      0.043755  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grouped.agg(coefficient_of_variance)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "88a608f2-0337-4ebe-bf1b-ff0b46def99e",
   "metadata": {},
   "source": [
    "## Exercise 1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e45e410-c69b-41b0-b11e-e939d35e80df",
   "metadata": {},
   "source": [
    "Using the tidy dataframe from the previous notebook (re-created in the cell below as `df_tidy`), group the rows by 'Intervention' and 'Channel' and display summary statistics for the 'intensity' column."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1b8769e5-0287-4fcc-b0d1-8ab9884c2c9d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Intervention</th>\n",
       "      <th>Channel</th>\n",
       "      <th>intensity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Before</td>\n",
       "      <td>channel_1</td>\n",
       "      <td>13.250000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Before</td>\n",
       "      <td>channel_1</td>\n",
       "      <td>44.954545</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Before</td>\n",
       "      <td>channel_1</td>\n",
       "      <td>13.590909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Before</td>\n",
       "      <td>channel_1</td>\n",
       "      <td>85.032258</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Before</td>\n",
       "      <td>channel_1</td>\n",
       "      <td>10.731707</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99</th>\n",
       "      <td>After</td>\n",
       "      <td>channel_2</td>\n",
       "      <td>73.286439</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>After</td>\n",
       "      <td>channel_2</td>\n",
       "      <td>145.900739</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>After</td>\n",
       "      <td>channel_2</td>\n",
       "      <td>115.347217</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>After</td>\n",
       "      <td>channel_2</td>\n",
       "      <td>61.225962</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>After</td>\n",
       "      <td>channel_2</td>\n",
       "      <td>77.490249</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>104 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    Intervention    Channel   intensity\n",
       "0         Before  channel_1   13.250000\n",
       "1         Before  channel_1   44.954545\n",
       "2         Before  channel_1   13.590909\n",
       "3         Before  channel_1   85.032258\n",
       "4         Before  channel_1   10.731707\n",
       "..           ...        ...         ...\n",
       "99         After  channel_2   73.286439\n",
       "100        After  channel_2  145.900739\n",
       "101        After  channel_2  115.347217\n",
       "102        After  channel_2   61.225962\n",
       "103        After  channel_2   77.490249\n",
       "\n",
       "[104 rows x 3 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('../../data/Multi_analysis.csv', header = [0,1], sep=';')\n",
    "df_tidy = df.melt(value_name='intensity', var_name=['Intervention', 'Channel'])\n",
    "df_tidy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7abd7722-9eb2-4acc-81b7-3f9dceeac73a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "a18125db-dc09-4107-b417-3f7dcc7cf0f1",
   "metadata": {},
   "source": [
    "## Exercise 2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e2cb6ad-a0c0-4dd6-b82b-ff1ed27e5793",
   "metadata": {},
   "source": [
    "Calculate the skewness of the intensities for each combination of 'Intervention' and 'Channel'.\n",
    "\n",
    "*Hint: pass the function [skew](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.skew.html?highlight=skew#scipy.stats.skew) from `scipy.stats` to the `.agg` method of the grouped dataframe.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b95c18f8-67cd-484f-81bb-4cea87d75950",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}