from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Colab Notebooks

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks


import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import re
import ast
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
pd.options.display.max_rows = 10000
pd.options.display.width = 1000


boxOffice_df = pd.read_csv(r'Data/Highest Holywood Grossing Movies.csv', encoding='latin-1') #import box office information 
IMDB_df = pd.read_csv(r'Data/IMDB.csv') #import IMDB
amazon_df = pd.read_csv(r'Data/amazon.csv') #import amazon
inflation_df = pd.read_csv(r'Data/inflation_data.csv') #import inflation


inflation_df.head()


def inflationChange(gross,year):
  year=year-1900
  rate=inflation_df["amount"][year] 
  oldValue=gross/rate #Converts to 1900 currency
  newValue=oldValue*inflation_df.loc[122,'amount'] #Converts to 2022 currency
  return newValue


IMDB_df.head()


IMDB_df=IMDB_df.drop(columns = ["Poster_Link","IMDB_Rating","Overview","Meta_score","Star1",	"Star2","Star3","Star4","No_of_Votes"])
IMDB_df = IMDB_df.rename(columns={'Director':'director','Released_Year': 'release_year','Genre':'listed_in','Series_Title':'title','Certificate':'rating',"Runtime":"duration"})
IMDB_df.head()


IMDB_df["rating"]=IMDB_df["rating"].map({"A": "R","U":"PG","UA":"PG-13","U/A":"PG-13","S":"NC-17"})


IMDB_df=IMDB_df.dropna()
IMDB_df.Gross = IMDB_df['Gross'].str.replace(',', '') #Removes ,


IMDB_df['Gross'] = IMDB_df['Gross'].astype(int)
IMDB_df['duration'] = IMDB_df['duration'].str.replace(r' min$', '', regex=True) #Removes the 'min' from the duration
IMDB_df["duration"]=IMDB_df["duration"].astype(int)


IMDB_df['currentGross']=""
for ind in IMDB_df.index: #Adjusts for inflation
  gross=IMDB_df["Gross"][ind]
  year=IMDB_df["release_year"][ind]
  IMDB_df.loc[ind,"currentGross"]=inflationChange(gross,year)
IMDB_df['currentGross'] = IMDB_df['currentGross'].astype(int)

IMDB_df.head()


boxOffice_df.head()


boxOffice_df=boxOffice_df.drop(columns = ["Unnamed: 0","Movie Info","Distributor","Domestic Sales (in $)","International Sales (in $)"])
boxOffice_df = boxOffice_df.rename(columns={'Title':'title','Release Date':'release_year','World Sales (in $)':'Gross','Genre':'listed_in','Movie Runtime':'duration','License':'rating'})
boxOffice_df=boxOffice_df.dropna()
boxOffice_df.head()


boxOffice_df['release_year'] = boxOffice_df['release_year'].str[-4:] #Makes it the year only
boxOffice_df['listed_in'] = boxOffice_df['listed_in'].str.replace('[', '', regex=True) #Removes [
boxOffice_df['listed_in'] = boxOffice_df['listed_in'].str.replace('\'', '', regex=True) #Removes '
boxOffice_df['listed_in'] = boxOffice_df['listed_in'].str.replace(']', '', regex=True) #Removes ]
boxOffice_df.head()


boxOffice_df['title'] = boxOffice_df['title'].str[:-6] #Removes the year from the title column


def durationConvert(time): #Converts the string runtime into an integer
  hour=time[0:time.find('hr')-1]
  minutes=time[time.find('hr')+3:time.find('min')]
  if(hour==""):
    hour=0
  if(minutes==""):
    minutes=0
  runtime = int(float(hour))*60+ int(float(minutes)) #Creates the new integer runtime
  return runtime
for ind in boxOffice_df.index: #Cycles through the dataframe
  boxOffice_df.loc[ind,"duration"]=durationConvert(boxOffice_df["duration"][ind])
boxOffice_df["release_year"]=boxOffice_df["release_year"].astype(int)
boxOffice_df['currentGross']=""
for ind in boxOffice_df.index: #Adjusts for inflation
  gross=boxOffice_df["Gross"][ind]
  year=boxOffice_df["release_year"][ind]
  boxOffice_df.loc[ind,"currentGross"]=inflationChange(gross,year)
boxOffice_df.head()


amazon_df.head()


amazon_df=amazon_df.drop(columns = ["show_id","cast","country","date_added","description"], axis=1)
amazon_df["rating"]=amazon_df["rating"].map({
    "All_Ages": "G",
    "ALL":"G",
    "7+":"PG",
    "13+":"PG-13",
    "16+":"PG-13",
    "18+":"R",
    "16":"PG-13",
    "AGES_16":"PG-13",
    "AGES_16_":"PG-13",
    "AGES_18":"R",
    "AGES_18_":"R",
    "NOT_RATE":"UNRATED"
})
amazon_df.head()


amazon_df=amazon_df.dropna()
amazon_df=amazon_df[amazon_df['type']=='Movie']
amazon_df=amazon_df.drop(columns = ["type"])
amazon_df.head()


amazon_df['duration'] = amazon_df['duration'].str.replace(r' min$', '', regex=True)
amazon_df["duration"]=amazon_df["duration"].astype(int)
amazon_df.head()


movies_df=pd.concat([IMDB_df, boxOffice_df],axis=0)
movies_df['duration']=movies_df['duration'].astype(int)
movies_df['currentGross']=movies_df['currentGross'].astype(int)
movies_df.head()


movies_df["listed_in_new"] = movies_df["listed_in"].apply(lambda x: x.split(","))
exploded_movies = movies_df.explode("listed_in_new").reset_index(drop=True)
exploded_movies['listed_in_new'] = exploded_movies['listed_in_new'].str.replace(" ", "")
exploded_movies.head()


amazon_df["listed_in_new"] = amazon_df["listed_in"].apply(lambda x: x.split(","))
exploded_amazon = amazon_df.explode("listed_in_new").reset_index(drop=True)
exploded_amazon['listed_in_new'] = exploded_amazon['listed_in_new'].str.replace(" ", "")
exploded_amazon.head()


movies_df["currentGross"].describe()

count    1.287000e+03
mean     4.215479e+08
std      4.266132e+08
min      1.019700e+04
25%      1.446810e+08
50%      3.131492e+08
75%      5.554031e+08
max      4.087621e+09
Name: currentGross, dtype: float64


#Spliting the dataframe in half based on gross revenue
Half1=movies_df[movies_df["currentGross"]<= 3.131669e+08]
Half2=movies_df[movies_df["currentGross"]>= 3.131669e+08]
#Dividing it into quarters
Q1=Half1[Half1["currentGross"]< 1.447342e+08]
Q2=Half1[Half1["currentGross"]> 1.447342e+08]
Q3=Half2[Half2["currentGross"]< 5.560239e+08]
Q4=Half2[Half2["currentGross"]> 5.560239e+08]


#Counts the frequency of the ratings
ratings=movies_df["rating"].value_counts().iloc[:]
ratings1=Q1["rating"].value_counts().iloc[:]
ratings2=Q2["rating"].value_counts().iloc[:]
ratings3=Q3["rating"].value_counts().iloc[:]
ratings4=Q4["rating"].value_counts().iloc[:]

fig, ax= plt.subplots(1,5, figsize=(25,5))
fig.suptitle('Movie Ratings', fontsize=22)
ax[0].pie(ratings1, autopct='%1.1f%%')#Quarter 1
ax[1].pie(ratings2, autopct='%1.1f%%')#Quarter 2
ax[2].pie(ratings3, autopct='%1.1f%%')#Quarter 3
ax[3].pie(ratings4, autopct='%1.1f%%')#Quarter 4
ax[4].pie(ratings, autopct='%1.1f%%')#All Movies
#Labels
labels=movies_df["rating"].unique()
ax[0].set_title("Quarter 1")
ax[1].set_title("Quarter 2")
ax[2].set_title("Quarter 3")
ax[3].set_title("Quarter 4")
ax[4].set_title("All Movies")
plt.legend(labels,bbox_to_anchor=(1.0, 1.0))
utopct='%1.1f%%'


#Creating new dataframes that only contain the duration of each movie
duration=movies_df["duration"]
duration1=Q1["duration"]
duration2=Q2["duration"]
duration3=Q3["duration"]
duration4=Q4["duration"]

fig, ax= plt.subplots(1,4, figsize=(25,8))
fig.suptitle('Movie Runtime', fontsize=22)
#Plotting the histograms
sns.histplot(duration1, ax=ax[0])#Quarter 1
sns.histplot(duration2, ax=ax[1])#Quarter 2
sns.histplot(duration3, ax=ax[2])#Quarter 3
sns.histplot(duration4, ax=ax[3])#Quarter 4
#Labeling
labels=movies_df["rating"].unique()
ax[0].set_title("Quarter 1")
ax[1].set_title("Quarter 2")
ax[2].set_title("Quarter 3")
ax[3].set_title("Quarter 4")

ax[0].set_xlabel("Runtime")
ax[0].set_ylabel("Frequency")
ax[1].set_xlabel("Frequency")
ax[1].set_ylabel("Runtime")
ax[2].set_xlabel("Frequency")
ax[2].set_ylabel("Runtime")
ax[3].set_xlabel("Frequency")
ax[3].set_ylabel("Runtime")

Text(0, 0.5, 'Runtime')


#Grouping the dataframes into gross revenue and release year
release_year=movies_df.groupby("release_year", as_index=False)["currentGross"].mean() #All movies
release_year1=Q1.groupby("release_year", as_index=False)["currentGross"].mean() #Quarter 1
release_year2=Q2.groupby("release_year", as_index=False)["currentGross"].mean() #Quarter 2
release_year3=Q3.groupby("release_year", as_index=False)["currentGross"].mean() #Quarter 3
release_year4=Q4.groupby("release_year", as_index=False)["currentGross"].mean() #Quarter 4
#Creating the lines of best fit for each graph
a1, b1 = np.polyfit(release_year1["release_year"], release_year1["currentGross"], 1)
a2, b2 = np.polyfit(release_year2["release_year"], release_year2["currentGross"], 1)
a3, b3 = np.polyfit(release_year3["release_year"], release_year3["currentGross"], 1)
a4, b4 = np.polyfit(release_year4["release_year"], release_year4["currentGross"], 1)
a, b = np.polyfit(release_year["release_year"], release_year["currentGross"], 1)

fig, ax= plt.subplots(1,5, figsize=(25, 8))
fig.suptitle('Movie Release Year', fontsize=22)
#Plotting the scatter plot
ax[0].scatter(release_year1["release_year"],release_year1["currentGross"])#Quarter 1
ax[1].scatter(release_year2["release_year"],release_year2["currentGross"])#Quarter 2
ax[2].scatter(release_year3["release_year"],release_year3["currentGross"])#Quarter 3
ax[3].scatter(release_year4["release_year"],release_year4["currentGross"])#Quarter 4
ax[4].scatter(release_year["release_year"],release_year["currentGross"])#All movies
#Plotting the lines of best fit
ax[0].plot(release_year1["release_year"], a1*release_year1["release_year"]+b1) 
ax[1].plot(release_year2["release_year"], a2*release_year2["release_year"]+b2) 
ax[2].plot(release_year3["release_year"], a3*release_year3["release_year"]+b3) 
ax[3].plot(release_year4["release_year"], a4*release_year4["release_year"]+b4) 
ax[4].plot(release_year["release_year"], a*release_year["release_year"]+b) 
#Labels
ax[0].set_title("Quarter 1")
ax[1].set_title("Quarter 2")
ax[2].set_title("Quarter 3")
ax[3].set_title("Quarter 4")
ax[4].set_title("All Movies")

ax[0].set_xlabel("Release Year")
ax[0].set_ylabel("Gross Revenue (in millions)")
ax[1].set_xlabel("Release Year")
ax[1].set_ylabel("Gross Revenue (in millions)")
ax[2].set_xlabel("Release Year")
ax[2].set_ylabel("Gross Revenue (in millions)")
ax[3].set_xlabel("Release Year")
ax[3].set_ylabel("Gross Revenue (in billions)")
ax[4].set_xlabel("Release Year")
ax[4].set_ylabel("Gross Revenue (in billions)")

Text(0, 0.5, 'Gross Revenue (in billions)')


#Creates a new dataframe storing movies with only the 5 most popular genres
exploded_movies_new = exploded_movies[exploded_movies['listed_in_new'].isin(exploded_movies['listed_in_new'].value_counts()[:5].index.tolist())]
#Spliting the dataframe in half based on gross revenue
Half1=exploded_movies_new[exploded_movies_new["currentGross"]<= 3.131669e+08]
Half2=exploded_movies_new[exploded_movies_new["currentGross"]>= 3.131669e+08]
#Dividing it into quarters
eQ1=Half1[Half1["currentGross"]< 1.447342e+08]
eQ2=Half1[Half1["currentGross"]> 1.447342e+08]
eQ3=Half2[Half2["currentGross"]< 5.560239e+08]
eQ4=Half2[Half2["currentGross"]> 5.560239e+08]


fig, ax= plt.subplots(1,5, figsize=(25,5))
fig.suptitle('Top 5 Genres', fontsize=22)
ax[0].pie(eQ1["listed_in_new"].value_counts().iloc[:], autopct='%1.1f%%') #Quarter 1
ax[1].pie(eQ2["listed_in_new"].value_counts().iloc[:], autopct='%1.1f%%') #Quarter 2
ax[2].pie(eQ3["listed_in_new"].value_counts().iloc[:], autopct='%1.1f%%') #Quarter 3
ax[3].pie(eQ4["listed_in_new"].value_counts().iloc[:], autopct='%1.1f%%') #Quarter 4
ax[4].pie(exploded_movies_new["listed_in_new"].value_counts().iloc[:], autopct='%1.1f%%') #All movies
labels=exploded_movies_new["listed_in_new"].unique()
#Labels
ax[0].set_title("Quarter 1")
ax[1].set_title("Quarter 2")
ax[2].set_title("Quarter 3")
ax[3].set_title("Quarter 4")
ax[4].set_title("All Movies")
plt.legend(labels,bbox_to_anchor=(1.0, 1.0))
autopct='%1.1f%%'


#Creates a new dataframe storing movies with only the 10 most popular directors
movies_new = movies_df[movies_df['director'].isin(movies_df['director'].value_counts()[:10].index.tolist())] 
#Spliting the dataframe in half based on gross revenue
Half1=movies_new[movies_new["currentGross"]<= 3.131669e+08]
Half2=movies_new[movies_new["currentGross"]>= 3.131669e+08]
#Dividing it into quarters
dQ1=Half1[Half1["currentGross"]< 1.447342e+08]
dQ2=Half1[Half1["currentGross"]> 1.447342e+08]
dQ3=Half2[Half2["currentGross"]< 5.560239e+08]
dQ4=Half2[Half2["currentGross"]> 5.560239e+08]


fig, ax= plt.subplots(1,1, figsize=(10,6))
Spielberg=movies_df[movies_df['director']=='Steven Spielberg'] #Spielberg's Movies
Scorsese=movies_df[movies_df['director']=='Martin Scorsese'] #Scorsese's Movies
Nolan=movies_df[movies_df['director']=='Christopher Nolan'] #Nolan's Movies
Hitchcock=movies_df[movies_df['director']=='Alfred Hitchcock'] #Hitchcock's Movies
Fincher=movies_df[movies_df['director']=='David Fincher'] #Fincher's Movies
Kubrick=movies_df[movies_df['director']=='Stanley Kubrick'] #Kubrick's Movies

#Merging the new data frames
PopularDirector = pd.DataFrame({'Steven Spielberg': Spielberg['currentGross'],
                             'Martin Scorsese': Scorsese['currentGross'],
                             'Christopher Nolan': Nolan['currentGross'],
                             'Alfred Hitchcock': Hitchcock['currentGross'],
                             'David Fincher': Fincher['currentGross'],
                             'Stanley Kubrick':Kubrick['currentGross']})
ax.set_ylabel("Gross Revenue (in billions)")
fig.tight_layout()
sns.boxplot(data=PopularDirector, palette='Set1', ax=ax) #Displaying the graph

<matplotlib.axes._subplots.AxesSubplot at 0x7fd1247f5850>


fig, ax= plt.subplots(1,5, figsize=(25,5))
fig.suptitle('Top 10 Directors', fontsize=22)
#Creates five pie charts 
ax[0].pie(dQ1["director"].value_counts().iloc[:], autopct='%1.1f%%') #Quarter 1
ax[1].pie(dQ2["director"].value_counts().iloc[:], autopct='%1.1f%%') #Quarter 2
ax[2].pie(dQ3["director"].value_counts().iloc[:], autopct='%1.1f%%') #Quarter 3
ax[3].pie(dQ4["director"].value_counts().iloc[:], autopct='%1.1f%%') #Quarter 4
ax[4].pie(movies_new["director"].value_counts().iloc[:], autopct='%1.1f%%') #All movies
#Labeling the graph
labels=movies_new["director"].unique()
ax[0].set_title("Quarter 1")
ax[1].set_title("Quarter 2")
ax[2].set_title("Quarter 3")
ax[3].set_title("Quarter 4")
ax[4].set_title("All Movies")
plt.legend(labels,bbox_to_anchor=(1.0, 1.0))
utopct='%1.1f%%' #Adds percentages to each part of the circle


#Creating the correlation for the top 4 directors
director_exploded_movies= exploded_movies[exploded_movies['director'].isin(exploded_movies['director'].value_counts()[:4].index.tolist())]
director_exploded_movies=director_exploded_movies.drop(columns = ['title','listed_in','listed_in_new','rating'])
director_exploded_movies=pd.get_dummies(director_exploded_movies)
director_exploded_movies=director_exploded_movies.corr()
#Creating the correlation for the ratings
rating_exploded_movies=exploded_movies.drop(columns = ['title','listed_in','listed_in_new','director'])
rating_exploded_movies=pd.get_dummies(rating_exploded_movies)
rating_exploded_movies=rating_exploded_movies.corr()
#Creating the correlation for top 4 genres
genre_exploded_movies= exploded_movies[exploded_movies['listed_in_new'].isin(exploded_movies['listed_in_new'].value_counts()[:4].index.tolist())]
genre_exploded_movies=genre_exploded_movies.drop(columns = ['title','listed_in','director','rating'])
genre_exploded_movies=pd.get_dummies(genre_exploded_movies)
genre_exploded_movies=genre_exploded_movies.corr()
#Creating the heatmap
fig, ax =plt.subplots(1,3, figsize=(20,5))
sns.heatmap(director_exploded_movies, annot=True,ax=ax[0])
sns.heatmap(rating_exploded_movies, annot=True, ax=ax[1])
sns.heatmap(genre_exploded_movies, annot=True, ax=ax[2])
fig.tight_layout()
ax[0].set_title("Directors")
ax[1].set_title("Rating")
ax[2].set_title("Genre")

Text(0.5, 1.0, 'Genre')


exploded_movies=exploded_movies.dropna()
# Define the features.
features = ["duration", "release_year","director","rating","listed_in_new"]
# Define the training data.
X_train_dict = exploded_movies[features].to_dict(orient="records")
X_dict = exploded_movies[features].to_dict(orient="records")
y=exploded_movies["currentGross"]
# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
model = KNeighborsRegressor(n_neighbors=20)
pipeline = Pipeline([
    ("scaler", scaler),
    ("model", model)
])
cross_val_score(pipeline, X_train, y, 
                cv=15, scoring="neg_mean_absolute_error")
# calculates estimate of test error based on 15-fold cross validation
def get_cv_error(k):
    model = KNeighborsRegressor(n_neighbors=k)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    mae = np.absolute(-cross_val_score(
        pipeline, X_dict, y, 
        cv=15, scoring="neg_mean_absolute_error"
    )).mean()
    return mae
ks = pd.Series(range(1, 51))
ks.index = range(1, 51)
ax = ks.apply(get_cv_error)
ax.argmin()

2


def prediction(duration, release_year, rating, listed_in_new, director):
  # Define the features.
  features = ["duration", "release_year","director","rating","listed_in_new"]
  # Define the training data.
  X_train_dict = exploded_movies[features].to_dict(orient="records")
  X_dict = exploded_movies[features].to_dict(orient="records")
  y=exploded_movies["currentGross"]
  X_new_dict = [{
      "duration":duration,
      "release_year":release_year,
      "rating":rating,
      "listed_in_new":listed_in_new,
      "director":director
  }]
  y_train = exploded_movies["currentGross"]

  # Dummy encoding
  vec = DictVectorizer(sparse=False)
  vec.fit(X_train_dict)
  X_train = vec.transform(X_train_dict)
  X_new = vec.transform(X_new_dict)

  # Standardization
  scaler = StandardScaler()
  scaler.fit(X_train)
  X_train_sc = scaler.transform(X_train)
  X_new_sc = scaler.transform(X_new)

  # K-Nearest Neighbors Model
  model = KNeighborsRegressor(n_neighbors=2)
  model.fit(X_train_sc, y_train)
  x=model.predict(X_new_sc)
  revenue=x[0]
  return revenue


print(prediction(152, 2008, 'PG-13', 'Action', 'Christopher Nolan'))

570140931.5


exploded_amazon["currentGross"]=0 #Makes a new column called currentGross and sets all values equal to 0
for ind in range(100): #Loops through the first 100 movies
  duration=exploded_amazon["duration"][ind]
  release_year=exploded_amazon["release_year"][ind]
  rating=exploded_amazon["rating"][ind]
  listed_in_new=exploded_amazon["listed_in_new"][ind]
  director=exploded_amazon["director"][ind]
  exploded_amazon.loc[ind,"currentGross"]=prediction(duration, release_year, rating, listed_in_new, director)
exploded_amazon["currentGross"]=exploded_amazon["currentGross"].astype(int)


exploded_amazon.sort_values(by=['currentGross'], ascending=False).head()


X_train_sc = scaler.transform(X_train)
model = KNeighborsRegressor(n_neighbors=2)
model.fit(X_train_sc, y)
y_train_pred=model.predict(X_train_sc)
accuracy = (y_train_pred == y).mean()
accuracy

0.5224839400428265

	year	amount	inflation rate
0	1900	1.00	0.01
1	1901	1.01	0.01
2	1902	1.02	0.01
3	1903	1.05	0.02
4	1904	1.06	0.01

	Poster_Link	Series_Title	Released_Year	Certificate	Runtime	Genre	IMDB_Rating	Overview	Meta_score	Director	Star1	Star2	Star3	Star4	No_of_Votes	Gross
0	https://m.media-amazon.com/images/M/MV5BMDFkYT...	The Shawshank Redemption	1994	A	142 min	Drama	9.3	Two imprisoned men bond over a number of years...	80.0	Frank Darabont	Tim Robbins	Morgan Freeman	Bob Gunton	William Sadler	2343110	28,341,469
1	https://m.media-amazon.com/images/M/MV5BM2MyNj...	The Godfather	1972	A	175 min	Crime, Drama	9.2	An organized crime dynasty's aging patriarch t...	100.0	Francis Ford Coppola	Marlon Brando	Al Pacino	James Caan	Diane Keaton	1620367	134,966,411
2	https://m.media-amazon.com/images/M/MV5BMTMxNT...	The Dark Knight	2008	UA	152 min	Action, Crime, Drama	9.0	When the menace known as the Joker wreaks havo...	84.0	Christopher Nolan	Christian Bale	Heath Ledger	Aaron Eckhart	Michael Caine	2303232	534,858,444
3	https://m.media-amazon.com/images/M/MV5BMWMwMG...	The Godfather: Part II	1974	A	202 min	Crime, Drama	9.0	The early life and career of Vito Corleone in ...	90.0	Francis Ford Coppola	Al Pacino	Robert De Niro	Robert Duvall	Diane Keaton	1129952	57,300,000
4	https://m.media-amazon.com/images/M/MV5BMWU4N2...	12 Angry Men	1957	U	96 min	Crime, Drama	9.0	A jury holdout attempts to prevent a miscarria...	96.0	Sidney Lumet	Henry Fonda	Lee J. Cobb	Martin Balsam	John Fiedler	689845	4,360,000

	title	release_year	rating	duration	listed_in	director	Gross
0	The Shawshank Redemption	1994	A	142 min	Drama	Frank Darabont	28,341,469
1	The Godfather	1972	A	175 min	Crime, Drama	Francis Ford Coppola	134,966,411
2	The Dark Knight	2008	UA	152 min	Action, Crime, Drama	Christopher Nolan	534,858,444
3	The Godfather: Part II	1974	A	202 min	Crime, Drama	Francis Ford Coppola	57,300,000
4	12 Angry Men	1957	U	96 min	Crime, Drama	Sidney Lumet	4,360,000

	title	release_year	rating	duration	listed_in	director	Gross	currentGross
0	The Shawshank Redemption	1994	R	142	Drama	Frank Darabont	28341469	57004269
1	The Godfather	1972	R	175	Crime, Drama	Francis Ford Coppola	134966411	961567924
2	The Dark Knight	2008	PG-13	152	Action, Crime, Drama	Christopher Nolan	534858444	740412703
3	The Godfather: Part II	1974	R	202	Crime, Drama	Francis Ford Coppola	57300000	346337989
4	12 Angry Men	1957	PG	96	Crime, Drama	Sidney Lumet	4360000	46176955

	Unnamed: 0	Title	Movie Info	Distributor	Release Date	Domestic Sales (in $)	International Sales (in $)	World Sales (in $)	Genre	Movie Runtime	License
0	0	Star Wars: Episode VII - The Force Awakens (2015)	As a new threat to the galaxy rises, Rey, a de...	Walt Disney Studios Motion Pictures	December 16, 2015	936662225	1132859475	2069521700	['Action', 'Adventure', 'Sci-Fi']	2 hr 18 min	PG-13
1	1	Avengers: Endgame (2019)	After the devastating events of Avengers: Infi...	Walt Disney Studios Motion Pictures	April 24, 2019	858373000	1939128328	2797501328	['Action', 'Adventure', 'Drama', 'Sci-Fi']	3 hr 1 min	PG-13
2	2	Avatar (2009)	A paraplegic Marine dispatched to the moon Pan...	Twentieth Century Fox	December 16, 2009	760507625	2086738578	2847246203	['Action', 'Adventure', 'Fantasy', 'Sci-Fi']	2 hr 42 min	PG-13
3	3	Black Panther (2018)	T'Challa, heir to the hidden but advanced king...	Walt Disney Studios Motion Pictures	NaN	700426566	647171407	1347597973	['Action', 'Adventure', 'Sci-Fi']	2 hr 14 min	NaN
4	4	Avengers: Infinity War (2018)	The Avengers and their allies must be willing ...	Walt Disney Studios Motion Pictures	NaN	678815482	1369544272	2048359754	['Action', 'Adventure', 'Sci-Fi']	2 hr 29 min	NaN

Predictive Film Revenue¶

Introduction¶

Collaboration Plan¶

Importing Libraries¶

Datasets¶

Extraction, Transform, and Load¶

Merging the Data¶

Exploratory Data Analysis¶

Prediction Model¶

Application¶

Conclusion¶

	title	release_year	Gross	listed_in	duration	rating
0	Star Wars: Episode VII - The Force Awakens (2015)	2015	2069521700	Action, Adventure, Sci-Fi	2 hr 18 min	PG-13
1	Avengers: Endgame (2019)	2019	2797501328	Action, Adventure, Drama, Sci-Fi	3 hr 1 min	PG-13
2	Avatar (2009)	2009	2847246203	Action, Adventure, Fantasy, Sci-Fi	2 hr 42 min	PG-13
6	Titanic (1997)	1997	2201647264	Drama, Romance	3 hr 14 min	PG-13
7	Jurassic World (2015)	2015	1670516444	Action, Adventure, Sci-Fi	2 hr 4 min	PG-13

	show_id	type	title	director	cast	country	date_added	release_year	rating	duration	listed_in	description
0	s1	Movie	The Grand Seduction	Don McKellar	Brendan Gleeson, Taylor Kitsch, Gordon Pinsent	Canada	March 30, 2021	2014	NaN	113 min	Comedy, Drama	A small fishing village must procure a local d...
1	s2	Movie	Take Care Good Night	Girish Joshi	Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar	India	March 30, 2021	2018	13+	110 min	Drama, International	A Metro Family decides to fight a Cyber Crimin...
2	s3	Movie	Secrets of Deception	Josh Webber	Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...	United States	March 30, 2021	2017	NaN	74 min	Action, Drama, Suspense	After a man discovers his wife is cheating on ...
3	s4	Movie	Pink: Staying True	Sonia Anderson	Interviews with: Pink, Adele, Beyoncé, Britney...	United States	March 30, 2021	2014	NaN	69 min	Documentary	Pink breaks the mold once again, bringing her ...
4	s5	Movie	Monster Maker	Giles Foster	Harry Dean Stanton, Kieran O'Brien, George Cos...	United Kingdom	March 30, 2021	1989	NaN	45 min	Drama, Fantasy	Teenage Matt Banting wants to work with a famo...

	title	director	release_year	rating	duration	listed_in	listed_in_new	currentGross
30	Yellowneck	R. John Hugh	1955	R	84	Action, Military and War, Western	MilitaryandWar	1127559762
95	Western Gold	Howard Bretherton	1937	R	56	Military and War, Western	MilitaryandWar	1127559762
91	What Price Crime	Albert Herman	1935	R	65	Suspense	Suspense	1127559762
40	Women of Valor	Buzz Kulik	1986	PG-13	95	Action, Drama	Action	821961324
87	When the Lights Go On Again	William K. Howard	1944	PG-13	103	Drama, Military and War	MilitaryandWar	740089392

	title	director	release_year	rating	duration	listed_in
1	Take Care Good Night	Girish Joshi	2018	PG-13	110	Drama, International
16	Zoombies	Glenn Miller	2016	PG-13	87	Horror, Science Fiction
25	Zambezi: Force of Life	Dr. Rudolf Lammers	2005	G	50	Documentary
32	Young Lovers	Ida Lupino	1949	R	86	Drama
38	Yoga with Sound Meditations	Mark Knight	2016	G	9	Fitness, Special Interest