|
| 1 | +''' |
| 2 | +from: https://codechalleng.es/bites/30/#console |
| 3 | +In this Bite we are going to parse a csv movie dataset to identify the directors with the highest rated movies. |
| 4 | +
|
| 5 | +Write get_movies_by_director: use csv.DictReader to convert movie_metadata.csv into a (default)dict of lists of Movie namedtuples. Convert/filter the data: |
| 6 | +Only extract director_name, movie_title, title_year and imdb_score. |
| 7 | +Type conversions: title_year -> int / imdb_score -> float |
| 8 | +Discard any movies older than 1960. |
| 9 | +Here is an extract: |
| 10 | +
|
| 11 | +.... |
| 12 | +{ 'Woody Allen': [ |
| 13 | + Movie(title='Midnight in Paris', year=2011, score=7.7), |
| 14 | + Movie(title='The Curse of the Jade Scorpion', year=2001, score=6.8), |
| 15 | + Movie(title='To Rome with Love', year=2012, score=6.3), .... |
| 16 | + ], ... |
| 17 | +} |
| 18 | +Write the calc_mean_score helper that takes a list of Movie namedtuples and calculates the mean IMDb score, returning the score rounded to 1 decimal place. |
| 19 | +Complete get_average_scores which takes the directors data structure returned by get_movies_by_director (see 1.) and returns a list of tuples (director, average_score) ordered by highest score in descending order. Only take directors into account with >= MIN_MOVIES |
| 20 | +See the tests for more info. This could be a tough one, but we really hope you learn a thing or two. Good luck and keep calm and code in Python! |
| 21 | +''' |
| 22 | + |
| 23 | +import csv |
| 24 | +from collections import defaultdict, namedtuple |
| 25 | +import os |
| 26 | +from urllib.request import urlretrieve |
| 27 | + |
# Remote location of the dataset and the directory the local copy is saved to.
BASE_URL = 'https://bites-data.s3.us-east-2.amazonaws.com/'
TMP = '/tmp'

fname = 'movie_metadata.csv'
# os.path.join is meant for filesystem paths and its separator handling is
# platform-dependent; BASE_URL already ends with '/', so plain concatenation
# builds the URL.
remote = BASE_URL + fname
local = os.path.join(TMP, fname)
# NOTE: this download runs at import time.
urlretrieve(remote, local)

MOVIE_DATA = local
# Minimum number of movies a director needs to be ranked.
MIN_MOVIES = 4
# Cutoff year from the task description (older movies are to be discarded).
MIN_YEAR = 1960

Movie = namedtuple('Movie', 'title year score')
| 41 | + |
| 42 | + |
def get_movies_by_director():
    """Group the movies found in MOVIE_DATA by director.

    Reads the csv with csv.DictReader and keeps only director_name,
    movie_title, title_year and imdb_score. title_year is converted to int
    and imdb_score to float; rows where either conversion fails are skipped,
    and so are movies released before MIN_YEAR.

    Returns a defaultdict(list) mapping each director name to a list of
    Movie namedtuples.
    """
    directors = defaultdict(list)

    # newline='' is what the csv module asks for when it is handed a file
    # object. NOTE(review): assumes the dataset is UTF-8 encoded - confirm.
    with open(MOVIE_DATA, encoding='utf-8', newline='') as movie_data:
        for row in csv.DictReader(movie_data):
            try:
                year = int(row['title_year'])
                score = float(row['imdb_score'])
            except ValueError:
                # Blank or non-numeric year/score: the row can neither be
                # filtered by year nor averaged, so leave it out.
                continue

            if year < MIN_YEAR:
                continue

            # Strip surrounding whitespace defensively so titles compare
            # cleanly (padding in the raw data is an assumption - verify).
            title = row['movie_title'].strip()
            directors[row['director_name']].append(
                Movie(title=title, year=year, score=score))

    return directors
| 55 | + |
| 56 | + |
def calc_mean_score(movies):
    """Return the average score of a list of Movie namedtuples,
    rounded to one decimal place"""
    values = [float(entry.score) for entry in movies]

    # Plain left-to-right accumulation, starting from 0.
    total = 0
    for value in values:
        total += value

    return round(total / len(movies), 1)
| 65 | + |
| 66 | + |
def get_average_scores(directors):
    """Rank directors by the mean score of their movies.

    Takes the directors dict (as returned by get_movies_by_director), skips
    every director with fewer than MIN_MOVIES movies and returns a list of
    (director, average_score) tuples ordered from highest to lowest average.
    """
    ranking = []
    for name, films in directors.items():
        if len(films) < MIN_MOVIES:
            continue
        ranking.append((name, calc_mean_score(films)))

    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking
| 77 | + |
| 78 | + |
if __name__ == '__main__':
    # Print one (director, average_score) tuple per line, in ranked order.
    directors_hash = get_movies_by_director()
    for avg in get_average_scores(directors_hash):
        print(avg)
0 commit comments