diff --git a/data_cleaning_using_numpy_and_pandas.ipynb b/data_cleaning_using_numpy_and_pandas.ipynb new file mode 100644 index 0000000000000..f6280bac41c76 --- /dev/null +++ b/data_cleaning_using_numpy_and_pandas.ipynb @@ -0,0 +1,4320 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyMgQy7SQ2SxtbxPZQQwGG/Q", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "V1wBFlZtCTEq" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv(\"/content/Data-cleaning-for-beginners-using-pandas.csv\",index_col=0)" + ], + "metadata": { + "id": "-nhyZMBWtjqL" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 237 + }, + "id": "9v5VHadhubdc", + "outputId": "4363d5f9-ad8a-4248-cf75-5212896f3ce2" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Age Salary Rating Location Established Easy Apply\n", + "Index \n", + "0 44.0 $44k-$99k 5.4 India,In 1999 TRUE\n", + "1 66.0 $55k-$66k 3.5 New York,Ny 2002 TRUE\n", + "2 NaN $77k-$89k -1.0 New York,Ny -1 -1\n", + "3 64.0 $44k-$99k 4.4 India In 1988 -1\n", + "4 25.0 $44k-$99k 6.4 Australia Aus 2002 -1" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeSalaryRatingLocationEstablishedEasy Apply
Index
044.0$44k-$99k5.4India,In1999TRUE
166.0$55k-$66k3.5New York,Ny2002TRUE
2NaN$77k-$89k-1.0New York,Ny-1-1
364.0$44k-$99k4.4India In1988-1
425.0$44k-$99k6.4Australia Aus2002-1
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 29,\n \"fields\": [\n {\n \"column\": \"Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8,\n \"min\": 0,\n \"max\": 28,\n \"num_unique_values\": 29,\n \"samples\": [\n 27,\n 16,\n 12\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 16.134781338188905,\n \"min\": 13.0,\n \"max\": 66.0,\n \"num_unique_values\": 12,\n \"samples\": [\n 13.0,\n 32.0,\n 44.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Salary\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"$44k-$99k\",\n \"$55k-$66k\",\n \"$19k-$40k\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.8251328572963894,\n \"min\": -1.0,\n \"max\": 7.8,\n \"num_unique_values\": 19,\n \"samples\": [\n 5.4,\n 1.4,\n 4.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Location\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"New York,Ny\",\n \"Australia Aus\",\n \"India,In\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Established\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 762,\n \"min\": -1,\n \"max\": 2020,\n \"num_unique_values\": 19,\n \"samples\": [\n 1999,\n 2009,\n 1955\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Easy Apply\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"-1\",\n \"TRUE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + 
"cell_type": "code", + "source": [ + "df.tail()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 237 + }, + "id": "8RzlupuRusWh", + "outputId": "d897f41a-d1b6-499c-d403-33da59a2052a" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Age Salary Rating Location Established Easy Apply\n", + "Index \n", + "24 13.0 $44k-$99k -1.0 New York,Ny 1987 -1\n", + "25 55.0 $44k-$99k 0.0 Australia Aus 1980 TRUE\n", + "26 NaN $55k-$66k NaN India,In 1934 TRUE\n", + "27 52.0 $44k-$99k 5.4 India,In 1935 -1\n", + "28 NaN $39k-$88k 3.4 Australia Aus 1932 -1" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeSalaryRatingLocationEstablishedEasy Apply
Index
2413.0$44k-$99k-1.0New York,Ny1987-1
2555.0$44k-$99k0.0Australia Aus1980TRUE
26NaN$55k-$66kNaNIndia,In1934TRUE
2752.0$44k-$99k5.4India,In1935-1
28NaN$39k-$88k3.4Australia Aus1932-1
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 24,\n \"max\": 28,\n \"num_unique_values\": 5,\n \"samples\": [\n 25,\n 28,\n 26\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 23.430749027719962,\n \"min\": 13.0,\n \"max\": 55.0,\n \"num_unique_values\": 3,\n \"samples\": [\n 13.0,\n 55.0,\n 52.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Salary\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"$44k-$99k\",\n \"$55k-$66k\",\n \"$39k-$88k\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.972653135948873,\n \"min\": -1.0,\n \"max\": 5.4,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.0,\n 3.4,\n -1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Location\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"New York,Ny\",\n \"Australia Aus\",\n \"India,In\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Established\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 27,\n \"min\": 1932,\n \"max\": 1987,\n \"num_unique_values\": 5,\n \"samples\": [\n 1980,\n 1932,\n 1934\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Easy Apply\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"TRUE\",\n \"-1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + 
"source": [ + "df.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "o6bqIGDluyDA", + "outputId": "80ac0603-a693-4367-f542-0fa4ec2140e8" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(29, 6)" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "ZdfSEXzou5qP", + "outputId": "62eaa441-4c1f-4d96-fea8-239dd0335e29" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Age Rating Established\n", + "count 22.000000 28.000000 29.000000\n", + "mean 39.045455 3.528571 1638.620690\n", + "std 16.134781 2.825133 762.079599\n", + "min 13.000000 -1.000000 -1.000000\n", + "25% 25.000000 1.050000 1935.000000\n", + "50% 39.500000 4.200000 1984.000000\n", + "75% 50.000000 5.400000 1999.000000\n", + "max 66.000000 7.800000 2020.000000" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeRatingEstablished
count22.00000028.00000029.000000
mean39.0454553.5285711638.620690
std16.1347812.825133762.079599
min13.000000-1.000000-1.000000
25%25.0000001.0500001935.000000
50%39.5000004.2000001984.000000
75%50.0000005.4000001999.000000
max66.0000007.8000002020.000000
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 18.207749242087708,\n \"min\": 13.0,\n \"max\": 66.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 39.04545454545455,\n 39.5,\n 22.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 9.092553623477537,\n \"min\": -1.0,\n \"max\": 28.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 3.528571428571429,\n 4.2,\n 28.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Established\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 893.6288555568016,\n \"min\": -1.0,\n \"max\": 2020.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 1638.6206896551723,\n 1984.0,\n 29.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.info()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4K6lOBj6vAxm", + "outputId": "a3e28a4c-77b3-4b28-b3bc-36ede288e33d" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Index: 29 entries, 0 to 28\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Age 22 non-null float64\n", + " 1 Salary 29 non-null object \n", + " 2 Rating 28 non-null float64\n", + " 3 Location 29 non-null object \n", + " 4 Established 29 non-null int64 \n", + " 5 Easy Apply 29 non-null object \n", + "dtypes: float64(2), int64(1), object(3)\n", + "memory usage: 1.6+ KB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "print('Total sum of Null values in data: ', 
df.isnull().sum().sum())\n", + "print()\n", + "print(df.isnull().sum())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "73pFmJYGvHBa", + "outputId": "a1bc326c-f8dc-4148-b85d-9cfebbb9a543" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Total sum of Null values in data: 8\n", + "\n", + "Age 7\n", + "Salary 0\n", + "Rating 1\n", + "Location 0\n", + "Established 0\n", + "Easy Apply 0\n", + "dtype: int64\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.columns = df.columns.str.lower().str.replace(\" \", \"_\")\n", + "df.columns" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3FGQFdgEvbwJ", + "outputId": "a4770612-61ea-45d4-9014-f6d8c59e1e65" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index(['age', 'salary', 'rating', 'location', 'established', 'easy_apply'], dtype='object')" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(df.age.mean())\n", + "\n", + "df['age'] = df.age.fillna(df.age.mean())\n", + "df['age'] = df.age.round(decimals=0)\n", + "\n", + "print(df.age.isnull().sum())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZejgEqQ4wC0c", + "outputId": "31116a31-34b1-4483-819b-4e5aaa8f5bd8" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "39.04545454545455\n", + "0\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 237 + }, + "id": "4pWnW5O3wdD8", + "outputId": "987a1e1a-eba3-409d-f072-dcaef0cade60" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " age salary rating location established 
easy_apply\n", + "Index \n", + "0 44.0 $44k-$99k 5.4 India,In 1999 TRUE\n", + "1 66.0 $55k-$66k 3.5 New York,Ny 2002 TRUE\n", + "2 39.0 $77k-$89k -1.0 New York,Ny -1 -1\n", + "3 64.0 $44k-$99k 4.4 India In 1988 -1\n", + "4 25.0 $44k-$99k 6.4 Australia Aus 2002 -1" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agesalaryratinglocationestablishedeasy_apply
Index
044.0$44k-$99k5.4India,In1999TRUE
166.0$55k-$66k3.5New York,Ny2002TRUE
239.0$77k-$89k-1.0New York,Ny-1-1
364.0$44k-$99k4.4India In1988-1
425.0$44k-$99k6.4Australia Aus2002-1
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 29,\n \"fields\": [\n {\n \"column\": \"Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8,\n \"min\": 0,\n \"max\": 28,\n \"num_unique_values\": 29,\n \"samples\": [\n 27,\n 16,\n 12\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13.973144544908509,\n \"min\": 13.0,\n \"max\": 66.0,\n \"num_unique_values\": 13,\n \"samples\": [\n 13.0,\n 19.0,\n 44.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"salary\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"$44k-$99k\",\n \"$55k-$66k\",\n \"$19k-$40k\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.8251328572963894,\n \"min\": -1.0,\n \"max\": 7.8,\n \"num_unique_values\": 19,\n \"samples\": [\n 5.4,\n 1.4,\n 4.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"location\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"New York,Ny\",\n \"Australia Aus\",\n \"India,In\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"established\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 762,\n \"min\": -1,\n \"max\": 2020,\n \"num_unique_values\": 19,\n \"samples\": [\n 1999,\n 2009,\n 1955\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"easy_apply\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"-1\",\n \"TRUE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { 
+ "cell_type": "code", + "source": [ + "# -1 is the placeholder for a missing founding year; label those rows explicitly.\n", + "# Assign the result back: fillna(inplace=True) on the df[\"established\"] selection is\n", + "# chained assignment, which warns on pandas 2.x and leaves df untouched under Copy-on-Write.\n", + "df[\"established\"] = df[\"established\"].replace(-1, np.nan).fillna(\"Unknown\")" + ], + "metadata": { + "id": "0y-1hSsMwqtQ" + }, + "execution_count": 13, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Inspect the dtype of the location column before using the .str accessor on it.\n", + "df[\"location\"].dtype" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9CXJNAkGw-33", + "outputId": "4c904fb2-6e5e-444b-eb80-001f2a8cf5e4" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "dtype('O')" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ] + },
+ { + "cell_type": "code", + "source": [ + "# Rows separate place and code with a comma or a space (\"India,In\", \"India In\",\n", + "# \"Australia Aus\"). Turn commas into spaces and split on the last whitespace run so\n", + "# every spelling parses and multi-word places such as \"New York\" stay intact.\n", + "location_parts = df[\"location\"].str.replace(\",\", \" \", regex=False).str.rsplit(n=1, expand=True)\n", + "df[\"location_city\"] = location_parts[0].str.strip()\n", + "df[\"city_sign\"] = location_parts[1]\n", + "df = df.drop(columns=\"location\")" + ], + "metadata": { + "id": "DyKI3GTIxlfw" + }, + "execution_count": 16, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# -1 marks a missing rating: blank it first so it is excluded from the mean used to impute.\n", + "df['rating'] = df[\"rating\"].replace(-1,np.nan)\n", + "df['rating'] = df[\"rating\"].fillna(df['rating'].mean()).round(decimals=1)" + ], + "metadata": { + "id": "0qS9KbwEx5G7" + }, + "execution_count": 17, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# The raw column holds the strings \"TRUE\" and \"-1\"; store a real boolean instead of\n", + "# a mix of \"TRUE\" and \"False\" strings.\n", + "df[\"easy_apply\"] = df[\"easy_apply\"].eq(\"TRUE\")" + ], + "metadata": { + "id": "NzF89RvByHRj" + }, + "execution_count": 18, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Strip the currency symbol and the \"k\" suffix in one vectorised pass:\n", + "# \"$44k-$99k\" -> \"44-99\". Inside a character class \"$\" is a literal, not an anchor.\n", + "df[\"salary\"] = df[\"salary\"].str.replace(r\"[$k]\", \"\", regex=True)" + ], + "metadata": { + "id": "yW1W8vvdyQCZ" + }, + "execution_count": 19, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 990 + }, + "id": "AgKNGOGnyfhR", + "outputId": "421c7e78-01be-4142-ffc2-b81e086b5ae6" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " age salary rating established easy_apply location_city city_sign\n", + "Index \n", + "0 44.0 44-99 5.4 1999.0 TRUE India In\n", + "1 66.0 55-66 3.5 2002.0 TRUE New York Ny\n", + "2 39.0 77-89 4.3 Unknown False New York Ny\n", + "3 64.0 44-99 4.4 1988.0 False India In None\n", + "4 25.0 44-99 6.4 2002.0 False Australia Aus\n", + "5 44.0 77-89 1.4 1999.0 TRUE India In\n", + "6 
21.0 44-99 0.0 Unknown False New York Ny\n", + "7 44.0 44-99 4.3 Unknown False Australia Aus\n", + "8 35.0 44-99 5.4 Unknown False New York Ny\n", + "9 22.0 44-99 7.7 Unknown TRUE India In\n", + "10 55.0 10-49 5.4 2008.0 TRUE India In\n", + "11 44.0 10-49 6.7 2009.0 False India In\n", + "12 39.0 44-99 0.0 1999.0 False India In\n", + "13 25.0 44-99 4.3 2019.0 TRUE Australia Aus\n", + "14 66.0 44-99 4.0 2020.0 TRUE Australia Aus\n", + "15 44.0 88-101 3.0 1999.0 False Australia Aus\n", + "16 19.0 19-40 4.5 1984.0 False India In\n", + "17 39.0 44-99 5.3 1943.0 TRUE New York Ny\n", + "18 35.0 44-99 6.7 1954.0 TRUE New York Ny\n", + "19 32.0 44-99 3.3 1955.0 TRUE New York Ny\n", + "20 39.0 44-99 5.7 1944.0 TRUE New York Ny\n", + "21 35.0 44-99 5.0 1946.0 False New York Ny\n", + "22 19.0 55-66 7.8 1988.0 TRUE New York Ny\n", + "23 39.0 44-99 2.4 1999.0 TRUE New York Ny\n", + "24 13.0 44-99 4.3 1987.0 False New York Ny\n", + "25 55.0 44-99 0.0 1980.0 TRUE Australia Aus\n", + "26 39.0 55-66 4.3 1934.0 TRUE India In\n", + "27 52.0 44-99 5.4 1935.0 False India In\n", + "28 39.0 39-88 3.4 1932.0 False Australia Aus" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agesalaryratingestablishedeasy_applylocation_citycity_sign
Index
044.044-995.41999.0TRUEIndiaIn
166.055-663.52002.0TRUENew YorkNy
239.077-894.3UnknownFalseNew YorkNy
364.044-994.41988.0FalseIndia InNone
425.044-996.42002.0FalseAustraliaAus
544.077-891.41999.0TRUEIndiaIn
621.044-990.0UnknownFalseNew YorkNy
744.044-994.3UnknownFalseAustraliaAus
835.044-995.4UnknownFalseNew YorkNy
922.044-997.7UnknownTRUEIndiaIn
1055.010-495.42008.0TRUEIndiaIn
1144.010-496.72009.0FalseIndiaIn
1239.044-990.01999.0FalseIndiaIn
1325.044-994.32019.0TRUEAustraliaAus
1466.044-994.02020.0TRUEAustraliaAus
1544.088-1013.01999.0FalseAustraliaAus
1619.019-404.51984.0FalseIndiaIn
1739.044-995.31943.0TRUENew YorkNy
1835.044-996.71954.0TRUENew YorkNy
1932.044-993.31955.0TRUENew YorkNy
2039.044-995.71944.0TRUENew YorkNy
2135.044-995.01946.0FalseNew YorkNy
2219.055-667.81988.0TRUENew YorkNy
2339.044-992.41999.0TRUENew YorkNy
2413.044-994.31987.0FalseNew YorkNy
2555.044-990.01980.0TRUEAustraliaAus
2639.055-664.31934.0TRUEIndiaIn
2752.044-995.41935.0FalseIndiaIn
2839.039-883.41932.0FalseAustraliaAus
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 29,\n \"fields\": [\n {\n \"column\": \"Index\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8,\n \"min\": 0,\n \"max\": 28,\n \"num_unique_values\": 29,\n \"samples\": [\n 27,\n 16,\n 12\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13.973144544908509,\n \"min\": 13.0,\n \"max\": 66.0,\n \"num_unique_values\": 13,\n \"samples\": [\n 13.0,\n 19.0,\n 44.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"salary\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"44-99\",\n \"55-66\",\n \"19-40\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.06842316372556,\n \"min\": 0.0,\n \"max\": 7.8,\n \"num_unique_values\": 19,\n \"samples\": [\n 5.4,\n 1.4,\n 4.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"established\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 19,\n \"samples\": [\n 1999.0,\n 2009.0,\n 1955.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"easy_apply\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"False\",\n \"TRUE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"location_city\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"New York\",\n \"Australia\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"city_sign\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"In\",\n \"Ny\"\n 
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# CHIPOTLE\n" + ], + "metadata": { + "id": "_0Zqbm3lyt7a" + } + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import pandas as pd" + ], + "metadata": { + "id": "Beh4q25hy1f0" + }, + "execution_count": 21, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv(\"/content/Data Cleaning with numpy and pandas - Dataset 2.csv\")" + ], + "metadata": { + "id": "sG7kGBnQy55U" + }, + "execution_count": 34, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "KvsHzbI0zye4", + "outputId": "ff6fa303-574f-4487-f687-a9bf51c7471b" + }, + "execution_count": 35, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " order_id quantity item_name \\\n", + "0 1 1 Chips and Fresh Tomato Salsa \n", + "1 1 1 Izze \n", + "2 1 1 Nantucket Nectar \n", + "3 1 1 Chips and Tomatillo-Green Chili Salsa \n", + "4 2 2 Chicken Bowl \n", + "\n", + " choice_description item_price \n", + "0 NaN $2.39 \n", + "1 [Clementine] $3.39 \n", + "2 [Apple] $3.39 \n", + "3 NaN $2.39 \n", + "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN$2.39
111Izze[Clementine]$3.39
211Nantucket Nectar[Apple]$3.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 4622,\n \"fields\": [\n {\n \"column\": \"order_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 528,\n \"min\": 1,\n \"max\": 1834,\n \"num_unique_values\": 1834,\n \"samples\": [\n 644,\n 333,\n 991\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"quantity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 15,\n \"num_unique_values\": 9,\n \"samples\": [\n 8,\n 2,\n 15\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item_name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 50,\n \"samples\": [\n \"Barbacoa Burrito\",\n \"Crispy Tacos\",\n \"Chips and Roasted Chili Corn Salsa\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"choice_description\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1043,\n \"samples\": [\n \"[Roasted Chili Corn Salsa, [Black Beans, Sour Cream, Cheese, Guacamole]]\",\n \"[Tomatillo Red Chili Salsa, [Guacamole, Cheese]]\",\n \"[Fresh Tomato Salsa, [Rice, Cheese, Sour Cream, Guacamole, Lettuce]]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item_price\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 78,\n \"samples\": [\n \"$23.50\",\n \"$2.39\",\n \"$7.40\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 35 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.tail()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "HsOpJJ6Fz5O-", + "outputId": "7044e0e2-1a62-494e-eb66-15d9185999e3" + }, + "execution_count": 36, + "outputs": [ + { + "output_type": "execute_result", + "data": 
{ + "text/plain": [ + " order_id quantity item_name \\\n", + "4617 1833 1 Steak Burrito \n", + "4618 1833 1 Steak Burrito \n", + "4619 1834 1 Chicken Salad Bowl \n", + "4620 1834 1 Chicken Salad Bowl \n", + "4621 1834 1 Chicken Salad Bowl \n", + "\n", + " choice_description item_price \n", + "4617 [Fresh Tomato Salsa, [Rice, Black Beans, Sour ... $11.75 \n", + "4618 [Fresh Tomato Salsa, [Rice, Sour Cream, Cheese... $11.75 \n", + "4619 [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $11.25 \n", + "4620 [Fresh Tomato Salsa, [Fajita Vegetables, Lettu... $8.75 \n", + "4621 [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $8.75 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idquantityitem_namechoice_descriptionitem_price
461718331Steak Burrito[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...$11.75
461818331Steak Burrito[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...$11.75
461918341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...$11.25
462018341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...$8.75
462118341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...$8.75
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"order_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1833,\n \"max\": 1834,\n \"num_unique_values\": 2,\n \"samples\": [\n 1834,\n 1833\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"quantity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 1,\n \"num_unique_values\": 1,\n \"samples\": [\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item_name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Chicken Salad Bowl\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"choice_description\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese, Lettuce, Guacamole]]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item_price\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"$11.75\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 36 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OIqh0nVE0DWq", + "outputId": "572515c9-80ea-4722-f21c-b73308a86b14" + }, + "execution_count": 37, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(4622, 5)" + ] + }, + "metadata": {}, + "execution_count": 37 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.info()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QbIgh40J0GFj", + "outputId": 
"9ac73846-f6c9-43a7-867a-1343db15b0a5" + }, + "execution_count": 38, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 4622 entries, 0 to 4621\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 order_id 4622 non-null int64 \n", + " 1 quantity 4622 non-null int64 \n", + " 2 item_name 4622 non-null object\n", + " 3 choice_description 3376 non-null object\n", + " 4 item_price 4622 non-null object\n", + "dtypes: int64(2), object(3)\n", + "memory usage: 180.7+ KB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.columns" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2tRUbb7A0Oej", + "outputId": "2fbb8dc5-84ea-433d-8f80-f12e0c1f1887" + }, + "execution_count": 39, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index(['order_id', 'quantity', 'item_name', 'choice_description',\n", + " 'item_price'],\n", + " dtype='object')" + ] + }, + "metadata": {}, + "execution_count": 39 + } + ] + }, + { + "cell_type": "code", + "source": [ + "c = df.groupby('item_name').sum()\n", + "c = c.sort_values(['quantity'], ascending = False)\n", + "c.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 237 + }, + "id": "ht5ydjj10Shj", + "outputId": "60b39bc7-f3fa-4722-dfd4-1ebd0557c467" + }, + "execution_count": 40, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " order_id quantity \\\n", + "item_name \n", + "Chicken Bowl 713926 761 \n", + "Chicken Burrito 497303 591 \n", + "Chips and Guacamole 449959 506 \n", + "Steak Burrito 328437 386 \n", + "Canned Soft Drink 304753 351 \n", + "\n", + " choice_description \\\n", + "item_name \n", + "Chicken Bowl [Tomatillo-Red Chili Salsa (Hot), [Black Beans... \n", + "Chicken Burrito [Tomatillo-Green Chili Salsa (Medium), [Pinto ... 
\n", + "Chips and Guacamole 0 \n", + "Steak Burrito [Tomatillo Red Chili Salsa, [Fajita Vegetables... \n", + "Canned Soft Drink [Coke][Sprite][Coke][Coke][Lemonade][Sprite][D... \n", + "\n", + " item_price \n", + "item_name \n", + "Chicken Bowl $16.98$10.98$11.25$8.75$8.49$11.25$8.75$8.75$8... \n", + "Chicken Burrito $8.49$8.49$10.98$8.49$10.98$10.98$8.75$10.98$8... \n", + "Chips and Guacamole $4.45$4.45$4.45$4.45$4.45$3.99$4.45$3.99$4.45$... \n", + "Steak Burrito $11.75$9.25$8.99$11.75$8.99$8.99$8.99$8.99$8.9... \n", + "Canned Soft Drink $1.25$1.25$1.25$1.25$1.25$1.25$1.25$1.25$1.25$... " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idquantitychoice_descriptionitem_price
item_name
Chicken Bowl713926761[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98$10.98$11.25$8.75$8.49$11.25$8.75$8.75$8...
Chicken Burrito497303591[Tomatillo-Green Chili Salsa (Medium), [Pinto ...$8.49$8.49$10.98$8.49$10.98$10.98$8.75$10.98$8...
Chips and Guacamole4499595060$4.45$4.45$4.45$4.45$4.45$3.99$4.45$3.99$4.45$...
Steak Burrito328437386[Tomatillo Red Chili Salsa, [Fajita Vegetables...$11.75$9.25$8.99$11.75$8.99$8.99$8.99$8.99$8.9...
Canned Soft Drink304753351[Coke][Sprite][Coke][Coke][Lemonade][Sprite][D...$1.25$1.25$1.25$1.25$1.25$1.25$1.25$1.25$1.25$...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "c", + "summary": "{\n \"name\": \"c\",\n \"rows\": 50,\n \"fields\": [\n {\n \"column\": \"item_name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 50,\n \"samples\": [\n \"Veggie Burrito\",\n \"Veggie Soft Tacos\",\n \"Barbacoa Soft Tacos\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"order_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 142559,\n \"min\": 279,\n \"max\": 713926,\n \"num_unique_values\": 50,\n \"samples\": [\n 80962,\n 5520,\n 18725\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"quantity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 159,\n \"min\": 1,\n \"max\": 761,\n \"num_unique_values\": 40,\n \"samples\": [\n 56,\n 71,\n 87\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"choice_description\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 39,\n \"samples\": [\n \"[Braised Carnitas, Pinto Beans, [Sour Cream, Cheese, Cilantro-Lime Rice]][Adobo-Marinated and Grilled Steak, [Sour Cream, Salsa, Cheese, Cilantro-Lime Rice, Guacamole]]\",\n \"[Adobo-Marinated and Grilled Steak][Adobo-Marinated and Grilled Chicken]\",\n \"[Coke][Sprite][Coke][Coke][Lemonade][Sprite][Diet Coke][Coke][Coke][Coke][Sprite][Coke][Coke][Coke][Diet Coke][Lemonade][Coke][Diet Coke][Diet Coke][Diet Coke][Diet Coke][Nestea][Diet Coke][Coke][Coke][Lemonade][Coke][Coke][Diet Coke][Diet Coke][Lemonade][Lemonade][Diet Coke][Coke][Sprite][Coke][Sprite][Coke][Sprite][Nestea][Lemonade][Diet Coke][Coke][Sprite][Coke][Diet Coke][Sprite][Diet Coke][Coke][Lemonade][Lemonade][Diet Coke][Coke][Diet Coke][Diet Coke][Coke][Diet Coke][Coke][Coke][Diet Coke][Sprite][Diet Coke][Coke][Diet Coke][Diet Coke][Diet Coke][Diet Coke][Coke][Diet Coke][Diet Coke][Lemonade][Lemonade][Diet Coke][Coke][Diet 
Coke][Coke][Coke][Diet Coke][Coke][Coke][Lemonade][Coke][Diet Coke][Diet Coke][Diet Coke][Diet Coke][Diet Coke][Diet Coke][Coke][Nestea][Sprite][Diet Coke][Diet Coke][Lemonade][Coke][Sprite][Diet Coke][Coke][Coke][Lemonade][Diet Coke][Sprite][Diet Coke][Diet Coke][Lemonade][Sprite][Lemonade][Diet Coke][Lemonade][Sprite][Diet Coke][Coke][Coke][Coke][Coke][Diet Coke][Diet Coke][Sprite][Coke][Diet Coke][Coke][Lemonade][Diet Coke][Coke][Coke][Sprite][Diet Coke][Sprite][Diet Coke][Coke][Diet Coke][Sprite][Diet Coke][Sprite][Lemonade][Nestea][Diet Coke][Lemonade][Coke][Coke][Sprite][Sprite][Nestea][Nestea][Coke][Sprite][Coke][Coke][Diet Coke][Coke][Diet Coke][Sprite][Diet Coke][Coke][Sprite][Coke][Coke][Coke][Diet Coke][Diet Coke][Diet Coke][Nestea][Lemonade][Sprite][Diet Coke][Nestea][Diet Coke][Coke][Diet Coke][Sprite][Coke][Coke][Coke][Diet Coke][Coke][Diet Coke][Sprite][Diet Coke][Sprite][Coke][Diet Coke][Diet Coke][Coke][Diet Coke][Coke][Diet Coke][Coke][Diet Coke][Diet Coke][Sprite][Diet Coke][Sprite][Sprite][Nestea][Sprite][Sprite][Sprite][Nestea][Diet Coke][Diet Coke][Lemonade][Nestea][Coke][Diet Coke][Coke][Coke][Sprite][Diet Coke][Diet Coke][Coke][Coke][Sprite][Diet Coke][Diet Coke][Sprite][Coke][Diet Coke][Lemonade][Diet Coke][Sprite][Sprite][Diet Coke][Nestea][Coke][Coke][Lemonade][Nestea][Coke][Coke][Nestea][Nestea][Diet Coke][Lemonade][Coke][Coke][Diet Coke][Coke][Diet Coke][Lemonade][Coke][Lemonade][Sprite][Lemonade][Coke][Nestea][Coke][Sprite][Sprite][Diet Coke][Coke][Sprite][Coke][Coke][Coke][Diet Coke][Coke][Diet Coke][Coke][Diet Coke][Sprite][Nestea][Coke][Coke][Sprite][Coke][Coke][Coke][Lemonade][Sprite][Coke][Coke][Sprite][Coke][Diet Coke][Coke][Coke][Lemonade][Lemonade][Diet Coke][Diet Coke][Coke][Sprite][Lemonade][Sprite][Coke][Sprite][Diet Coke][Sprite][Sprite][Coke][Diet Coke][Sprite][Coke][Sprite][Diet Coke][Coke][Diet Coke][Nestea][Diet Coke][Coke][Sprite]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 
\"item_price\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49,\n \"samples\": [\n \"$11.25$8.49$11.25$11.25$10.98$8.49$8.49$11.25$8.49$8.49$8.75$8.49$8.49$8.49$8.49$8.49$8.49$8.75$8.49$11.25$8.49$8.49$8.49$11.25$8.49$11.25$8.49$11.25$8.49$8.75$11.25$8.49$8.49$8.49$8.75$10.98$8.75$8.49$8.49$8.75$8.49$11.25$8.49$11.25$10.98$11.25$11.25$11.25$8.75$8.75$8.75$8.75$8.75$11.25$11.25$11.25$10.98$8.75$11.25$8.75$8.49$8.49$11.25$8.49$8.75$11.25$8.49$11.25$8.49$8.75$11.25$8.75$8.75$8.49$11.25$8.49$8.49$11.25$11.25$33.75$8.49$11.25$8.49$8.49$8.49$8.49$8.49$8.49$11.25$11.25$11.25$11.25$8.75$11.25$11.25\",\n \"$7.40$7.40\",\n \"$8.99\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 40 + } + ] + }, + { + "cell_type": "code", + "source": [ + "c = df.groupby('choice_description').sum()\n", + "c = c.sort_values(['quantity'], ascending = False)\n", + "c.head(1)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 + }, + "id": "FUPWaqBB2DyY", + "outputId": "765c1db5-6c5c-4950-8437-b45567b06e97" + }, + "execution_count": 41, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " order_id quantity \\\n", + "choice_description \n", + "[Diet Coke] 123455 159 \n", + "\n", + " item_name \\\n", + "choice_description \n", + "[Diet Coke] Canned SodaCanned SodaCanned Soda6 Pack Soft D... \n", + "\n", + " item_price \n", + "choice_description \n", + "[Diet Coke] $2.18$1.09$1.09$6.49$2.18$1.25$1.09$6.49$6.49$... " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idquantityitem_nameitem_price
choice_description
[Diet Coke]123455159Canned SodaCanned SodaCanned Soda6 Pack Soft D...$2.18$1.09$1.09$6.49$2.18$1.25$1.09$6.49$6.49$...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "c", + "summary": "{\n \"name\": \"c\",\n \"rows\": 1043,\n \"fields\": [\n {\n \"column\": \"choice_description\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1043,\n \"samples\": [\n \"[Roasted Chili Corn Salsa, [Fajita Vegetables, Rice, Pinto Beans, Cheese, Guacamole, Lettuce]]\",\n \"[Tomatillo-Red Chili Salsa (Hot), [Cheese, Sour Cream, Lettuce]]\",\n \"[[Fresh Tomato Salsa (Mild), Tomatillo-Green Chili Salsa (Medium), Roasted Chili Corn Salsa (Medium), Tomatillo-Red Chili Salsa (Hot)], [Pinto Beans, Black Beans, Rice, Cheese, Sour Cream]]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"order_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 7177,\n \"min\": 4,\n \"max\": 123455,\n \"num_unique_values\": 922,\n \"samples\": [\n 628,\n 1607,\n 414\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"quantity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8,\n \"min\": 1,\n \"max\": 159,\n \"num_unique_values\": 34,\n \"samples\": [\n 20,\n 15,\n 7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item_name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 404,\n \"samples\": [\n \"Barbacoa Soft TacosBarbacoa Soft TacosBarbacoa Soft TacosBarbacoa Soft TacosBarbacoa Soft TacosCarnitas BurritoBarbacoa Soft TacosSteak Burrito\",\n \"Chicken Soft TacosSteak BowlVeggie BowlCarnitas Bowl\",\n \"Barbacoa Bowl\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item_price\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 284,\n \"samples\": [\n \"$9.25$8.75$9.25$8.75$8.75$8.75$8.75$8.75$8.75$8.75$8.75$8.75$8.75$9.25$8.75$8.75$9.25$8.75$8.75$8.75$17.50$8.75$8.75$8.75$9.25$8.75$8.75$8.75$8.75\",\n \"$8.75$9.39\",\n 
\"$10.98$10.98$10.98$10.98$10.98\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 41 + } + ] + }, + { + "cell_type": "code", + "source": [ + "total_items_ordered = df.quantity.sum()\n", + "total_items_ordered" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "waQgPZVA2SGD", + "outputId": "11459347-e488-4aec-b723-50827938f911" + }, + "execution_count": 42, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "4972" + ] + }, + "metadata": {}, + "execution_count": 42 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df[\"item_price\"]= df[\"item_price\"].str.replace('$','').astype(float)\n", + "df" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 423 + }, + "id": "XI1BDzqX2TsH", + "outputId": "e223bc7a-e2dc-4faa-ea55-665e5be94517" + }, + "execution_count": 43, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " order_id quantity item_name \\\n", + "0 1 1 Chips and Fresh Tomato Salsa \n", + "1 1 1 Izze \n", + "2 1 1 Nantucket Nectar \n", + "3 1 1 Chips and Tomatillo-Green Chili Salsa \n", + "4 2 2 Chicken Bowl \n", + "... ... ... ... \n", + "4617 1833 1 Steak Burrito \n", + "4618 1833 1 Steak Burrito \n", + "4619 1834 1 Chicken Salad Bowl \n", + "4620 1834 1 Chicken Salad Bowl \n", + "4621 1834 1 Chicken Salad Bowl \n", + "\n", + " choice_description item_price \n", + "0 NaN 2.39 \n", + "1 [Clementine] 3.39 \n", + "2 [Apple] 3.39 \n", + "3 NaN 2.39 \n", + "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... 16.98 \n", + "... ... ... \n", + "4617 [Fresh Tomato Salsa, [Rice, Black Beans, Sour ... 11.75 \n", + "4618 [Fresh Tomato Salsa, [Rice, Sour Cream, Cheese... 11.75 \n", + "4619 [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... 11.25 \n", + "4620 [Fresh Tomato Salsa, [Fajita Vegetables, Lettu... 
8.75 \n", + "4621 [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... 8.75 \n", + "\n", + "[4622 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN2.39
111Izze[Clementine]3.39
211Nantucket Nectar[Apple]3.39
311Chips and Tomatillo-Green Chili SalsaNaN2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...16.98
..................
461718331Steak Burrito[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...11.75
461818331Steak Burrito[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...11.75
461918341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...11.25
462018341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...8.75
462118341Chicken Salad Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...8.75
\n", + "

4622 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 4622,\n \"fields\": [\n {\n \"column\": \"order_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 528,\n \"min\": 1,\n \"max\": 1834,\n \"num_unique_values\": 1834,\n \"samples\": [\n 644,\n 333,\n 991\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"quantity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 15,\n \"num_unique_values\": 9,\n \"samples\": [\n 8,\n 2,\n 15\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item_name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 50,\n \"samples\": [\n \"Barbacoa Burrito\",\n \"Crispy Tacos\",\n \"Chips and Roasted Chili Corn Salsa\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"choice_description\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1043,\n \"samples\": [\n \"[Roasted Chili Corn Salsa, [Black Beans, Sour Cream, Cheese, Guacamole]]\",\n \"[Tomatillo Red Chili Salsa, [Guacamole, Cheese]]\",\n \"[Fresh Tomato Salsa, [Rice, Cheese, Sour Cream, Guacamole, Lettuce]]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item_price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4.245557197940302,\n \"min\": 1.09,\n \"max\": 44.25,\n \"num_unique_values\": 78,\n \"samples\": [\n 23.5,\n 2.39,\n 7.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 43 + } + ] + }, + { + "cell_type": "code", + "source": [ + "revene= df['item_price'].sum()\n", + "print('Revenue was: $',revene)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yXVc_PpF2moL", + "outputId": "5ae57e9e-ddeb-438c-b2ff-4488da2d86b4" + }, 
+ "execution_count": 44, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Revenue was: $ 34500.16\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Number of distinct orders placed.\n", + "orders = df.order_id.nunique()\n", + "orders" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "66wd1ZXQ2pcU", + "outputId": "d8d1ed51-e78c-473c-fee7-9105597cdbf3" + }, + "execution_count": 45, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1834" + ] + }, + "metadata": {}, + "execution_count": 45 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Idempotent clean-up: works whether item_price is still text such as '$2.39'\n", + "# or has already been converted to float by the earlier cell.\n", + "df[\"item_price\"] = df[\"item_price\"].astype(str).str.replace(r'[^\\d.]', '', regex=True).astype(float)\n", + "\n", + "# item_price is already the line total for the row (quantity 2 of an $8.49\n", + "# Chicken Bowl is listed as $16.98), so it must not be multiplied by quantity\n", + "# again; doing so double-counts and disagrees with the total revenue cell.\n", + "df['revenue'] = df['item_price']\n", + "\n", + "# Total quantity and revenue per order (numeric columns only, so text columns\n", + "# are not concatenated by sum()).\n", + "order_grouped = df.groupby('order_id')[['quantity', 'revenue']].sum()\n", + "\n", + "# Mean revenue per order\n", + "mean_revenue = order_grouped['revenue'].mean()\n", + "print(mean_revenue)" + ], + "metadata": { + "id": "r4AVZ_-D2-TD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Number of distinct menu items ordered.\n", + "df.item_name.nunique()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ICBq8jNz4QtY", + "outputId": "67406211-7361-4a0c-c70a-c437dd82ec52" + }, + "execution_count": 51, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "50" + ] + }, + "metadata": {}, + "execution_count": 51 + } + ] + } + ] +} \ No newline at end of file