From 1986f15638554e2ebcd621753c531b73ae1c2f75 Mon Sep 17 00:00:00 2001
From: karthik270403 <143382518+karthik270403@users.noreply.github.com>
Date: Mon, 17 Jun 2024 14:28:51 +0530
Subject: [PATCH] Created using Colab
---
data_cleaning_using_numpy_and_pandas.ipynb | 4320 ++++++++++++++++++++
1 file changed, 4320 insertions(+)
create mode 100644 data_cleaning_using_numpy_and_pandas.ipynb
diff --git a/data_cleaning_using_numpy_and_pandas.ipynb b/data_cleaning_using_numpy_and_pandas.ipynb
new file mode 100644
index 0000000000000..f6280bac41c76
--- /dev/null
+++ b/data_cleaning_using_numpy_and_pandas.ipynb
@@ -0,0 +1,4320 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "authorship_tag": "ABX9TyMgQy7SQ2SxtbxPZQQwGG/Q",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "V1wBFlZtCTEq"
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df = pd.read_csv(filepath_or_buffer=\"/content/Data-cleaning-for-beginners-using-pandas.csv\", index_col=0)  # first CSV column becomes the row index"
+ ],
+ "metadata": {
+ "id": "-nhyZMBWtjqL"
+ },
+ "execution_count": 3,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.head(n=5)  # preview the first five rows of the raw frame"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 237
+ },
+ "id": "9v5VHadhubdc",
+ "outputId": "4363d5f9-ad8a-4248-cf75-5212896f3ce2"
+ },
+ "execution_count": 4,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Age Salary Rating Location Established Easy Apply\n",
+ "Index \n",
+ "0 44.0 $44k-$99k 5.4 India,In 1999 TRUE\n",
+ "1 66.0 $55k-$66k 3.5 New York,Ny 2002 TRUE\n",
+ "2 NaN $77k-$89k -1.0 New York,Ny -1 -1\n",
+ "3 64.0 $44k-$99k 4.4 India In 1988 -1\n",
+ "4 25.0 $44k-$99k 6.4 Australia Aus 2002 -1"
+ ],
+ "text/html": [
+ "\n",
+ "
\n", + " | Age | \n", + "Salary | \n", + "Rating | \n", + "Location | \n", + "Established | \n", + "Easy Apply | \n", + "
---|---|---|---|---|---|---|
Index | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
0 | \n", + "44.0 | \n", + "$44k-$99k | \n", + "5.4 | \n", + "India,In | \n", + "1999 | \n", + "TRUE | \n", + "
1 | \n", + "66.0 | \n", + "$55k-$66k | \n", + "3.5 | \n", + "New York,Ny | \n", + "2002 | \n", + "TRUE | \n", + "
2 | \n", + "NaN | \n", + "$77k-$89k | \n", + "-1.0 | \n", + "New York,Ny | \n", + "-1 | \n", + "-1 | \n", + "
3 | \n", + "64.0 | \n", + "$44k-$99k | \n", + "4.4 | \n", + "India In | \n", + "1988 | \n", + "-1 | \n", + "
4 | \n", + "25.0 | \n", + "$44k-$99k | \n", + "6.4 | \n", + "Australia Aus | \n", + "2002 | \n", + "-1 | \n", + "
\n", + " | Age | \n", + "Salary | \n", + "Rating | \n", + "Location | \n", + "Established | \n", + "Easy Apply | \n", + "
---|---|---|---|---|---|---|
Index | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
24 | \n", + "13.0 | \n", + "$44k-$99k | \n", + "-1.0 | \n", + "New York,Ny | \n", + "1987 | \n", + "-1 | \n", + "
25 | \n", + "55.0 | \n", + "$44k-$99k | \n", + "0.0 | \n", + "Australia Aus | \n", + "1980 | \n", + "TRUE | \n", + "
26 | \n", + "NaN | \n", + "$55k-$66k | \n", + "NaN | \n", + "India,In | \n", + "1934 | \n", + "TRUE | \n", + "
27 | \n", + "52.0 | \n", + "$44k-$99k | \n", + "5.4 | \n", + "India,In | \n", + "1935 | \n", + "-1 | \n", + "
28 | \n", + "NaN | \n", + "$39k-$88k | \n", + "3.4 | \n", + "Australia Aus | \n", + "1932 | \n", + "-1 | \n", + "
\n", + " | Age | \n", + "Rating | \n", + "Established | \n", + "
---|---|---|---|
count | \n", + "22.000000 | \n", + "28.000000 | \n", + "29.000000 | \n", + "
mean | \n", + "39.045455 | \n", + "3.528571 | \n", + "1638.620690 | \n", + "
std | \n", + "16.134781 | \n", + "2.825133 | \n", + "762.079599 | \n", + "
min | \n", + "13.000000 | \n", + "-1.000000 | \n", + "-1.000000 | \n", + "
25% | \n", + "25.000000 | \n", + "1.050000 | \n", + "1935.000000 | \n", + "
50% | \n", + "39.500000 | \n", + "4.200000 | \n", + "1984.000000 | \n", + "
75% | \n", + "50.000000 | \n", + "5.400000 | \n", + "1999.000000 | \n", + "
max | \n", + "66.000000 | \n", + "7.800000 | \n", + "2020.000000 | \n", + "
\n", + " | age | \n", + "salary | \n", + "rating | \n", + "location | \n", + "established | \n", + "easy_apply | \n", + "
---|---|---|---|---|---|---|
Index | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
0 | \n", + "44.0 | \n", + "$44k-$99k | \n", + "5.4 | \n", + "India,In | \n", + "1999 | \n", + "TRUE | \n", + "
1 | \n", + "66.0 | \n", + "$55k-$66k | \n", + "3.5 | \n", + "New York,Ny | \n", + "2002 | \n", + "TRUE | \n", + "
2 | \n", + "39.0 | \n", + "$77k-$89k | \n", + "-1.0 | \n", + "New York,Ny | \n", + "-1 | \n", + "-1 | \n", + "
3 | \n", + "64.0 | \n", + "$44k-$99k | \n", + "4.4 | \n", + "India In | \n", + "1988 | \n", + "-1 | \n", + "
4 | \n", + "25.0 | \n", + "$44k-$99k | \n", + "6.4 | \n", + "Australia Aus | \n", + "2002 | \n", + "-1 | \n", + "
pandas.core.generic.NDFrame.astype
def astype(dtype, copy: bool_t | None=None, errors: IgnoreRaise='raise') -> NDFrameT
\n", + " \n", + " Cast a pandas object to a specified dtype ``dtype``.\n", + "\n", + "Parameters\n", + "----------\n", + "dtype : str, data type, Series or Mapping of column name -> data type\n", + " Use a str, numpy.dtype, pandas.ExtensionDtype or Python type to\n", + " cast entire pandas object to the same type. Alternatively, use a\n", + " mapping, e.g. {col: dtype, ...}, where col is a column label and dtype is\n", + " a numpy.dtype or Python type to cast one or more of the DataFrame's\n", + " columns to column-specific types.\n", + "copy : bool, default True\n", + " Return a copy when ``copy=True`` (be very careful setting\n", + " ``copy=False`` as changes to values then may propagate to other\n", + " pandas objects).\n", + "errors : {'raise', 'ignore'}, default 'raise'\n", + " Control raising of exceptions on invalid data for provided dtype.\n", + "\n", + " - ``raise`` : allow exceptions to be raised\n", + " - ``ignore`` : suppress exceptions. On error return original object.\n", + "\n", + "Returns\n", + "-------\n", + "same type as caller\n", + "\n", + "See Also\n", + "--------\n", + "to_datetime : Convert argument to datetime.\n", + "to_timedelta : Convert argument to timedelta.\n", + "to_numeric : Convert argument to a numeric type.\n", + "numpy.ndarray.astype : Cast a numpy array to a specified type.\n", + "\n", + "Notes\n", + "-----\n", + ".. 
versionchanged:: 2.0.0\n", + "\n", + " Using ``astype`` to convert from timezone-naive dtype to\n", + " timezone-aware dtype will raise an exception.\n", + " Use :meth:`Series.dt.tz_localize` instead.\n", + "\n", + "Examples\n", + "--------\n", + "Create a DataFrame:\n", + "\n", + ">>> d = {'col1': [1, 2], 'col2': [3, 4]}\n", + ">>> df = pd.DataFrame(data=d)\n", + ">>> df.dtypes\n", + "col1 int64\n", + "col2 int64\n", + "dtype: object\n", + "\n", + "Cast all columns to int32:\n", + "\n", + ">>> df.astype('int32').dtypes\n", + "col1 int32\n", + "col2 int32\n", + "dtype: object\n", + "\n", + "Cast col1 to int32 using a dictionary:\n", + "\n", + ">>> df.astype({'col1': 'int32'}).dtypes\n", + "col1 int32\n", + "col2 int64\n", + "dtype: object\n", + "\n", + "Create a series:\n", + "\n", + ">>> ser = pd.Series([1, 2], dtype='int32')\n", + ">>> ser\n", + "0 1\n", + "1 2\n", + "dtype: int32\n", + ">>> ser.astype('int64')\n", + "0 1\n", + "1 2\n", + "dtype: int64\n", + "\n", + "Convert to categorical type:\n", + "\n", + ">>> ser.astype('category')\n", + "0 1\n", + "1 2\n", + "dtype: category\n", + "Categories (2, int32): [1, 2]\n", + "\n", + "Convert to ordered categorical type with custom ordering:\n", + "\n", + ">>> from pandas.api.types import CategoricalDtype\n", + ">>> cat_dtype = CategoricalDtype(\n", + "... categories=[2, 1], ordered=True)\n", + ">>> ser.astype(cat_dtype)\n", + "0 1\n", + "1 2\n", + "dtype: category\n", + "Categories (2, int64): [2 < 1]\n", + "\n", + "Create a series of dates:\n", + "\n", + ">>> ser_date = pd.Series(pd.date_range('20200101', periods=3))\n", + ">>> ser_date\n", + "0 2020-01-01\n", + "1 2020-01-02\n", + "2 2020-01-03\n", + "dtype: datetime64[ns]
\n", + " | age | \n", + "salary | \n", + "rating | \n", + "established | \n", + "easy_apply | \n", + "location_city | \n", + "city_sign | \n", + "
---|---|---|---|---|---|---|---|
Index | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
0 | \n", + "44.0 | \n", + "44-99 | \n", + "5.4 | \n", + "1999.0 | \n", + "TRUE | \n", + "India | \n", + "In | \n", + "
1 | \n", + "66.0 | \n", + "55-66 | \n", + "3.5 | \n", + "2002.0 | \n", + "TRUE | \n", + "New York | \n", + "Ny | \n", + "
2 | \n", + "39.0 | \n", + "77-89 | \n", + "4.3 | \n", + "Unknown | \n", + "False | \n", + "New York | \n", + "Ny | \n", + "
3 | \n", + "64.0 | \n", + "44-99 | \n", + "4.4 | \n", + "1988.0 | \n", + "False | \n", + "India In | \n", + "None | \n", + "
4 | \n", + "25.0 | \n", + "44-99 | \n", + "6.4 | \n", + "2002.0 | \n", + "False | \n", + "Australia | \n", + "Aus | \n", + "
5 | \n", + "44.0 | \n", + "77-89 | \n", + "1.4 | \n", + "1999.0 | \n", + "TRUE | \n", + "India | \n", + "In | \n", + "
6 | \n", + "21.0 | \n", + "44-99 | \n", + "0.0 | \n", + "Unknown | \n", + "False | \n", + "New York | \n", + "Ny | \n", + "
7 | \n", + "44.0 | \n", + "44-99 | \n", + "4.3 | \n", + "Unknown | \n", + "False | \n", + "Australia | \n", + "Aus | \n", + "
8 | \n", + "35.0 | \n", + "44-99 | \n", + "5.4 | \n", + "Unknown | \n", + "False | \n", + "New York | \n", + "Ny | \n", + "
9 | \n", + "22.0 | \n", + "44-99 | \n", + "7.7 | \n", + "Unknown | \n", + "TRUE | \n", + "India | \n", + "In | \n", + "
10 | \n", + "55.0 | \n", + "10-49 | \n", + "5.4 | \n", + "2008.0 | \n", + "TRUE | \n", + "India | \n", + "In | \n", + "
11 | \n", + "44.0 | \n", + "10-49 | \n", + "6.7 | \n", + "2009.0 | \n", + "False | \n", + "India | \n", + "In | \n", + "
12 | \n", + "39.0 | \n", + "44-99 | \n", + "0.0 | \n", + "1999.0 | \n", + "False | \n", + "India | \n", + "In | \n", + "
13 | \n", + "25.0 | \n", + "44-99 | \n", + "4.3 | \n", + "2019.0 | \n", + "TRUE | \n", + "Australia | \n", + "Aus | \n", + "
14 | \n", + "66.0 | \n", + "44-99 | \n", + "4.0 | \n", + "2020.0 | \n", + "TRUE | \n", + "Australia | \n", + "Aus | \n", + "
15 | \n", + "44.0 | \n", + "88-101 | \n", + "3.0 | \n", + "1999.0 | \n", + "False | \n", + "Australia | \n", + "Aus | \n", + "
16 | \n", + "19.0 | \n", + "19-40 | \n", + "4.5 | \n", + "1984.0 | \n", + "False | \n", + "India | \n", + "In | \n", + "
17 | \n", + "39.0 | \n", + "44-99 | \n", + "5.3 | \n", + "1943.0 | \n", + "TRUE | \n", + "New York | \n", + "Ny | \n", + "
18 | \n", + "35.0 | \n", + "44-99 | \n", + "6.7 | \n", + "1954.0 | \n", + "TRUE | \n", + "New York | \n", + "Ny | \n", + "
19 | \n", + "32.0 | \n", + "44-99 | \n", + "3.3 | \n", + "1955.0 | \n", + "TRUE | \n", + "New York | \n", + "Ny | \n", + "
20 | \n", + "39.0 | \n", + "44-99 | \n", + "5.7 | \n", + "1944.0 | \n", + "TRUE | \n", + "New York | \n", + "Ny | \n", + "
21 | \n", + "35.0 | \n", + "44-99 | \n", + "5.0 | \n", + "1946.0 | \n", + "False | \n", + "New York | \n", + "Ny | \n", + "
22 | \n", + "19.0 | \n", + "55-66 | \n", + "7.8 | \n", + "1988.0 | \n", + "TRUE | \n", + "New York | \n", + "Ny | \n", + "
23 | \n", + "39.0 | \n", + "44-99 | \n", + "2.4 | \n", + "1999.0 | \n", + "TRUE | \n", + "New York | \n", + "Ny | \n", + "
24 | \n", + "13.0 | \n", + "44-99 | \n", + "4.3 | \n", + "1987.0 | \n", + "False | \n", + "New York | \n", + "Ny | \n", + "
25 | \n", + "55.0 | \n", + "44-99 | \n", + "0.0 | \n", + "1980.0 | \n", + "TRUE | \n", + "Australia | \n", + "Aus | \n", + "
26 | \n", + "39.0 | \n", + "55-66 | \n", + "4.3 | \n", + "1934.0 | \n", + "TRUE | \n", + "India | \n", + "In | \n", + "
27 | \n", + "52.0 | \n", + "44-99 | \n", + "5.4 | \n", + "1935.0 | \n", + "False | \n", + "India | \n", + "In | \n", + "
28 | \n", + "39.0 | \n", + "39-88 | \n", + "3.4 | \n", + "1932.0 | \n", + "False | \n", + "Australia | \n", + "Aus | \n", + "
\n", + " | order_id | \n", + "quantity | \n", + "item_name | \n", + "choice_description | \n", + "item_price | \n", + "
---|---|---|---|---|---|
0 | \n", + "1 | \n", + "1 | \n", + "Chips and Fresh Tomato Salsa | \n", + "NaN | \n", + "$2.39 | \n", + "
1 | \n", + "1 | \n", + "1 | \n", + "Izze | \n", + "[Clementine] | \n", + "$3.39 | \n", + "
2 | \n", + "1 | \n", + "1 | \n", + "Nantucket Nectar | \n", + "[Apple] | \n", + "$3.39 | \n", + "
3 | \n", + "1 | \n", + "1 | \n", + "Chips and Tomatillo-Green Chili Salsa | \n", + "NaN | \n", + "$2.39 | \n", + "
4 | \n", + "2 | \n", + "2 | \n", + "Chicken Bowl | \n", + "[Tomatillo-Red Chili Salsa (Hot), [Black Beans... | \n", + "$16.98 | \n", + "
\n", + " | order_id | \n", + "quantity | \n", + "item_name | \n", + "choice_description | \n", + "item_price | \n", + "
---|---|---|---|---|---|
4617 | \n", + "1833 | \n", + "1 | \n", + "Steak Burrito | \n", + "[Fresh Tomato Salsa, [Rice, Black Beans, Sour ... | \n", + "$11.75 | \n", + "
4618 | \n", + "1833 | \n", + "1 | \n", + "Steak Burrito | \n", + "[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese... | \n", + "$11.75 | \n", + "
4619 | \n", + "1834 | \n", + "1 | \n", + "Chicken Salad Bowl | \n", + "[Fresh Tomato Salsa, [Fajita Vegetables, Pinto... | \n", + "$11.25 | \n", + "
4620 | \n", + "1834 | \n", + "1 | \n", + "Chicken Salad Bowl | \n", + "[Fresh Tomato Salsa, [Fajita Vegetables, Lettu... | \n", + "$8.75 | \n", + "
4621 | \n", + "1834 | \n", + "1 | \n", + "Chicken Salad Bowl | \n", + "[Fresh Tomato Salsa, [Fajita Vegetables, Pinto... | \n", + "$8.75 | \n", + "
\n", + " | order_id | \n", + "quantity | \n", + "choice_description | \n", + "item_price | \n", + "
---|---|---|---|---|
item_name | \n", + "\n", + " | \n", + " | \n", + " | \n", + " |
Chicken Bowl | \n", + "713926 | \n", + "761 | \n", + "[Tomatillo-Red Chili Salsa (Hot), [Black Beans... | \n", + "$16.98$10.98$11.25$8.75$8.49$11.25$8.75$8.75$8... | \n", + "
Chicken Burrito | \n", + "497303 | \n", + "591 | \n", + "[Tomatillo-Green Chili Salsa (Medium), [Pinto ... | \n", + "$8.49$8.49$10.98$8.49$10.98$10.98$8.75$10.98$8... | \n", + "
Chips and Guacamole | \n", + "449959 | \n", + "506 | \n", + "0 | \n", + "$4.45$4.45$4.45$4.45$4.45$3.99$4.45$3.99$4.45$... | \n", + "
Steak Burrito | \n", + "328437 | \n", + "386 | \n", + "[Tomatillo Red Chili Salsa, [Fajita Vegetables... | \n", + "$11.75$9.25$8.99$11.75$8.99$8.99$8.99$8.99$8.9... | \n", + "
Canned Soft Drink | \n", + "304753 | \n", + "351 | \n", + "[Coke][Sprite][Coke][Coke][Lemonade][Sprite][D... | \n", + "$1.25$1.25$1.25$1.25$1.25$1.25$1.25$1.25$1.25$... | \n", + "
\n", + " | order_id | \n", + "quantity | \n", + "item_name | \n", + "item_price | \n", + "
---|---|---|---|---|
choice_description | \n", + "\n", + " | \n", + " | \n", + " | \n", + " |
[Diet Coke] | \n", + "123455 | \n", + "159 | \n", + "Canned SodaCanned SodaCanned Soda6 Pack Soft D... | \n", + "$2.18$1.09$1.09$6.49$2.18$1.25$1.09$6.49$6.49$... | \n", + "
\n", + " | order_id | \n", + "quantity | \n", + "item_name | \n", + "choice_description | \n", + "item_price | \n", + "
---|---|---|---|---|---|
0 | \n", + "1 | \n", + "1 | \n", + "Chips and Fresh Tomato Salsa | \n", + "NaN | \n", + "2.39 | \n", + "
1 | \n", + "1 | \n", + "1 | \n", + "Izze | \n", + "[Clementine] | \n", + "3.39 | \n", + "
2 | \n", + "1 | \n", + "1 | \n", + "Nantucket Nectar | \n", + "[Apple] | \n", + "3.39 | \n", + "
3 | \n", + "1 | \n", + "1 | \n", + "Chips and Tomatillo-Green Chili Salsa | \n", + "NaN | \n", + "2.39 | \n", + "
4 | \n", + "2 | \n", + "2 | \n", + "Chicken Bowl | \n", + "[Tomatillo-Red Chili Salsa (Hot), [Black Beans... | \n", + "16.98 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
4617 | \n", + "1833 | \n", + "1 | \n", + "Steak Burrito | \n", + "[Fresh Tomato Salsa, [Rice, Black Beans, Sour ... | \n", + "11.75 | \n", + "
4618 | \n", + "1833 | \n", + "1 | \n", + "Steak Burrito | \n", + "[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese... | \n", + "11.75 | \n", + "
4619 | \n", + "1834 | \n", + "1 | \n", + "Chicken Salad Bowl | \n", + "[Fresh Tomato Salsa, [Fajita Vegetables, Pinto... | \n", + "11.25 | \n", + "
4620 | \n", + "1834 | \n", + "1 | \n", + "Chicken Salad Bowl | \n", + "[Fresh Tomato Salsa, [Fajita Vegetables, Lettu... | \n", + "8.75 | \n", + "
4621 | \n", + "1834 | \n", + "1 | \n", + "Chicken Salad Bowl | \n", + "[Fresh Tomato Salsa, [Fajita Vegetables, Pinto... | \n", + "8.75 | \n", + "
4622 rows × 5 columns
\n", + "