Skip to content

Commit ac721e7

Browse files
committed
Completed bite 30
1 parent c2a5fb0 commit ac721e7

File tree

3 files changed

+81
-0
lines changed

3 files changed

+81
-0
lines changed
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
'''
2+
from: https://codechalleng.es/bites/30/#console
3+
In this Bite we are going to parse a csv movie dataset to identify the directors with the highest rated movies.
4+
5+
Write get_movies_by_director: use csv.DictReader to convert movie_metadata.csv into a (default)dict of lists of Movie namedtuples. Convert/filter the data:
6+
Only extract director_name, movie_title, title_year and imdb_score.
7+
Type conversions: title_year -> int / imdb_score -> float
8+
Discard any movies older than 1960.
9+
Here is an extract:
10+
11+
....
12+
{ 'Woody Allen': [
13+
Movie(title='Midnight in Paris', year=2011, score=7.7),
14+
Movie(title='The Curse of the Jade Scorpion', year=2001, score=6.8),
15+
Movie(title='To Rome with Love', year=2012, score=6.3), ....
16+
], ...
17+
}
18+
Write the calc_mean_score helper that takes a list of Movie namedtuples and calculates the mean IMDb score, returning the score rounded to 1 decimal place.
19+
Complete get_average_scores which takes the directors data structure returned by get_movies_by_director (see 1.) and returns a list of tuples (director, average_score) ordered by highest score in descending order. Only take into account directors with >= MIN_MOVIES movies.
20+
See the tests for more info. This could be a tough one, but we really hope you learn a thing or two. Good luck and keep calm and code in Python!
21+
'''
22+
23+
import csv
24+
from collections import defaultdict, namedtuple
25+
import os
26+
from urllib.request import urlretrieve
27+
28+
# Remote location of the dataset and the local directory it is saved to.
BASE_URL = 'https://bites-data.s3.us-east-2.amazonaws.com/'
TMP = '/tmp'

fname = 'movie_metadata.csv'
# URLs always use forward slashes. os.path.join would insert the platform
# separator (a backslash on Windows), so build the URL by concatenation;
# BASE_URL already ends with '/'.
remote = BASE_URL + fname
local = os.path.join(TMP, fname)
# Fetch the dataset at import time so MOVIE_DATA can be read straight away.
urlretrieve(remote, local)

MOVIE_DATA = local
MIN_MOVIES = 4  # a director needs at least this many movies to be ranked
MIN_YEAR = 1960  # cut-off year from the task description (older movies are to be discarded)

# Record holding the three fields kept for every movie.
Movie = namedtuple('Movie', 'title year score')
41+
42+
43+
def get_movies_by_director():
    """Extract the movies from the CSV file and group them by director.

    Reads MOVIE_DATA with csv.DictReader and keeps only the director name,
    title, year and IMDb score of each row. The year is converted to int
    and the score to float. Rows where either conversion fails (for
    example an empty ``title_year``) are skipped, as are movies released
    before MIN_YEAR.

    Returns:
        A defaultdict mapping each director name to a list of Movie
        namedtuples.
    """
    directors = defaultdict(list)

    # newline='' is what the csv module requires for file objects.
    # NOTE(review): the dataset is assumed to be UTF-8 encoded - confirm.
    with open(MOVIE_DATA, 'r', encoding='utf-8', newline='') as movie_data:
        for row in csv.DictReader(movie_data):
            try:
                year = int(row['title_year'])
                score = float(row['imdb_score'])
            except (TypeError, ValueError):
                # Missing or malformed year/score: the row cannot be typed.
                continue

            if year < MIN_YEAR:
                continue

            directors[row['director_name']].append(
                Movie(title=row['movie_title'], year=year, score=score))

    return directors
55+
56+
57+
def calc_mean_score(movies):
    """Return the mean IMDb score of *movies*, rounded to 1 decimal place.

    Args:
        movies: A list of Movie namedtuples. Any object with a ``score``
            attribute that ``float()`` accepts (a number or a numeric
            string) works.

    Returns:
        The mean score rounded to one decimal place, or 0.0 when
        *movies* is empty.
    """
    if not movies:
        # An empty list has no mean; return 0.0 rather than dividing by zero.
        return 0.0

    # float() keeps the helper usable when scores are still raw CSV strings.
    total = sum(float(movie.score) for movie in movies)
    return round(total / len(movies), 1)
65+
66+
67+
def get_average_scores(directors):
    """Rank directors by the mean score of their movies.

    Args:
        directors: Mapping of director name to a list of Movie namedtuples,
            as returned by get_movies_by_director.

    Returns:
        A list of (director, average_score) tuples sorted by average score,
        highest first. Directors with fewer than MIN_MOVIES movies are
        left out.
    """
    ranked = []
    for name, films in directors.items():
        if len(films) < MIN_MOVIES:
            continue
        ranked.append((name, calc_mean_score(films)))

    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
77+
78+
79+
if __name__ == '__main__':
    # Print one (director, average_score) tuple per line, in the order
    # returned by get_average_scores.
    directors_hash = get_movies_by_director()
    for avg in get_average_scores(directors_hash):
        print(avg)

0 commit comments

Comments
 (0)