import pandas as pd
import seaborn
import matplotlib.pyplot as plt
import datetime as dt
from datetime import date
from dateutil.relativedelta import relativedelta
!pip install yahoo_fin
from yahoo_fin import stock_info as si
def stock_data(stock_name):
df = si.get_data(stock_name, start_date=(date.today() - relativedelta(years=3)).strftime('%m/%d/%Y'),
end_date=date.today().strftime('%m/%d/%Y'))
return df
ticker='HD'
company_name='Home Depot'
df = stock_data(ticker)
df.head()
open | high | low | close | adjclose | volume | ticker | |
---|---|---|---|---|---|---|---|
2020-06-08 | 252.490005 | 256.809998 | 252.259995 | 256.769989 | 239.264954 | 3811900 | HD |
2020-06-09 | 255.330002 | 258.290009 | 253.860001 | 256.760010 | 239.255646 | 3716100 | HD |
2020-06-10 | 257.450012 | 259.290009 | 254.220001 | 254.449997 | 237.103104 | 3589800 | HD |
2020-06-11 | 248.860001 | 250.619995 | 238.740005 | 239.470001 | 223.144394 | 6563700 | HD |
2020-06-12 | 243.070007 | 246.389999 | 237.050003 | 242.449997 | 225.921204 | 5238500 | HD |
df.tail()
open | high | low | close | adjclose | volume | ticker | |
---|---|---|---|---|---|---|---|
2023-05-31 | 289.589996 | 290.000000 | 281.959991 | 283.450012 | 283.450012 | 18288800 | HD |
2023-06-01 | 284.049988 | 289.220001 | 279.980011 | 288.390015 | 288.390015 | 4305100 | HD |
2023-06-02 | 290.649994 | 296.209991 | 289.720001 | 295.940002 | 295.940002 | 4514600 | HD |
2023-06-05 | 295.619995 | 295.720001 | 291.369995 | 293.100006 | 293.100006 | 3028800 | HD |
2023-06-06 | 291.820007 | 296.920013 | 291.649994 | 296.000000 | 296.000000 | 2854800 | HD |
#Add target values for two weeks and one month.
#Dataset has values for weekdays only, so two weeks will be 10 data points
# and one month will be 20 data points
df['TwoWeeks'] = df['close'].rolling(10).mean()
df['Month'] = df['close'].rolling(20).mean()
df.head()
open | high | low | close | adjclose | volume | ticker | TwoWeeks | Month | |
---|---|---|---|---|---|---|---|---|---|
2020-06-08 | 252.490005 | 256.809998 | 252.259995 | 256.769989 | 239.264938 | 3811900 | HD | NaN | NaN |
2020-06-09 | 255.330002 | 258.290009 | 253.860001 | 256.760010 | 239.255646 | 3716100 | HD | NaN | NaN |
2020-06-10 | 257.450012 | 259.290009 | 254.220001 | 254.449997 | 237.103119 | 3589800 | HD | NaN | NaN |
2020-06-11 | 248.860001 | 250.619995 | 238.740005 | 239.470001 | 223.144379 | 6563700 | HD | NaN | NaN |
2020-06-12 | 243.070007 | 246.389999 | 237.050003 | 242.449997 | 225.921188 | 5238500 | HD | NaN | NaN |
df.dropna(inplace=True)
df.head()
open | high | low | close | adjclose | volume | ticker | TwoWeeks | Month | |
---|---|---|---|---|---|---|---|---|---|
2020-07-06 | 250.270004 | 251.500000 | 247.039993 | 249.550003 | 232.537186 | 3133800 | HD | 247.481999 | 248.1370 |
2020-07-07 | 247.369995 | 250.779999 | 247.070007 | 247.350006 | 230.487167 | 2927800 | HD | 247.300999 | 247.6660 |
2020-07-08 | 247.869995 | 249.789993 | 246.220001 | 249.169998 | 232.183075 | 2294000 | HD | 247.187000 | 247.2865 |
2020-07-09 | 249.660004 | 250.509995 | 246.350006 | 247.960007 | 231.055588 | 2994700 | HD | 247.370000 | 246.9620 |
2020-07-10 | 248.289993 | 250.330002 | 246.639999 | 250.110001 | 233.058990 | 2745300 | HD | 247.842999 | 247.4940 |
!pip install pytrends
import pytrends
from pytrends.request import TrendReq
pytrends = TrendReq()
kw_list=['Home Depot']
#Gather Google trends for ticker 'HD' and Home Depot
pytrends.build_payload(kw_list, geo='', timeframe='{} {}'.format((date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'),date.today().strftime('%Y-%m-%d')))#timeframe='2010-06-29 2023-05-16')
keyword_interest = pytrends.interest_over_time()
del keyword_interest['isPartial']
keyword_interest.columns.rename("{}".format(ticker))
keyword_interest.head()
Home Depot | |
---|---|
date | |
2020-06-07 | 100 |
2020-06-14 | 100 |
2020-06-21 | 96 |
2020-06-28 | 97 |
2020-07-05 | 90 |
keyword_interest['Home Depot'].plot(title='Google Trends for \'Home Depot\' in the last three years')
<AxesSubplot: title={'center': "Google Trends for 'Home Depot' in the last three years"}>
df_combined = pd.concat([df, keyword_interest], axis=1)
#df_combined.dropna(subset='Month', inplace=True)
df_combined.head(20)
open | high | low | close | adjclose | volume | ticker | TwoWeeks | Month | Home Depot | |
---|---|---|---|---|---|---|---|---|---|---|
2020-06-07 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 100.0 |
2020-06-14 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 100.0 |
2020-06-21 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 96.0 |
2020-06-28 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 97.0 |
2020-07-05 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 90.0 |
2020-07-06 | 250.270004 | 251.500000 | 247.039993 | 249.550003 | 232.537186 | 3133800.0 | HD | 247.481999 | 248.137000 | NaN |
2020-07-07 | 247.369995 | 250.779999 | 247.070007 | 247.350006 | 230.487167 | 2927800.0 | HD | 247.300999 | 247.666000 | NaN |
2020-07-08 | 247.869995 | 249.789993 | 246.220001 | 249.169998 | 232.183075 | 2294000.0 | HD | 247.187000 | 247.286500 | NaN |
2020-07-09 | 249.660004 | 250.509995 | 246.350006 | 247.960007 | 231.055588 | 2994700.0 | HD | 247.370000 | 246.962000 | NaN |
2020-07-10 | 248.289993 | 250.330002 | 246.639999 | 250.110001 | 233.058990 | 2745300.0 | HD | 247.842999 | 247.494000 | NaN |
2020-07-12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 86.0 |
2020-07-13 | 251.919998 | 257.859985 | 249.089996 | 249.619995 | 232.602386 | 4437500.0 | HD | 248.703999 | 247.852500 | NaN |
2020-07-14 | 249.000000 | 258.179993 | 248.460007 | 257.790009 | 240.215408 | 4614200.0 | HD | 249.871001 | 248.674001 | NaN |
2020-07-15 | 260.140015 | 261.290009 | 255.149994 | 257.799988 | 240.224716 | 4343700.0 | HD | 250.600000 | 249.066500 | NaN |
2020-07-16 | 256.760010 | 260.500000 | 256.000000 | 258.079987 | 240.485641 | 2511200.0 | HD | 251.592999 | 249.427999 | NaN |
2020-07-17 | 260.029999 | 260.649994 | 257.720001 | 260.380005 | 242.628845 | 3091300.0 | HD | 252.781000 | 249.986499 | NaN |
2020-07-19 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 85.0 |
2020-07-20 | 259.040009 | 261.200012 | 258.019989 | 260.170013 | 242.433228 | 2451100.0 | HD | 253.843001 | 250.662500 | NaN |
2020-07-21 | 261.619995 | 263.869995 | 260.720001 | 262.420013 | 244.529800 | 2439900.0 | HD | 255.350002 | 251.325500 | NaN |
2020-07-22 | 262.589996 | 265.589996 | 262.000000 | 265.170013 | 247.092316 | 2750000.0 | HD | 256.950003 | 252.068501 | NaN |
Google Trends give values for Sundays only when the data is pulled in years. I will move Sunday values to the week after since we are concerned with trends rather than daily values.
df_combined['Home Depot'] = df_combined['Home Depot'].fillna(method='ffill')
df_combined.dropna(inplace=True)
df_combined.head(30)
open | high | low | close | adjclose | volume | ticker | TwoWeeks | Month | Home Depot | |
---|---|---|---|---|---|---|---|---|---|---|
2020-07-06 | 250.270004 | 251.500000 | 247.039993 | 249.550003 | 232.537186 | 3133800.0 | HD | 247.481999 | 248.137000 | 90.0 |
2020-07-07 | 247.369995 | 250.779999 | 247.070007 | 247.350006 | 230.487167 | 2927800.0 | HD | 247.300999 | 247.666000 | 90.0 |
2020-07-08 | 247.869995 | 249.789993 | 246.220001 | 249.169998 | 232.183075 | 2294000.0 | HD | 247.187000 | 247.286500 | 90.0 |
2020-07-09 | 249.660004 | 250.509995 | 246.350006 | 247.960007 | 231.055588 | 2994700.0 | HD | 247.370000 | 246.962000 | 90.0 |
2020-07-10 | 248.289993 | 250.330002 | 246.639999 | 250.110001 | 233.058990 | 2745300.0 | HD | 247.842999 | 247.494000 | 90.0 |
2020-07-13 | 251.919998 | 257.859985 | 249.089996 | 249.619995 | 232.602386 | 4437500.0 | HD | 248.703999 | 247.852500 | 86.0 |
2020-07-14 | 249.000000 | 258.179993 | 248.460007 | 257.790009 | 240.215408 | 4614200.0 | HD | 249.871001 | 248.674001 | 86.0 |
2020-07-15 | 260.140015 | 261.290009 | 255.149994 | 257.799988 | 240.224716 | 4343700.0 | HD | 250.600000 | 249.066500 | 86.0 |
2020-07-16 | 256.760010 | 260.500000 | 256.000000 | 258.079987 | 240.485641 | 2511200.0 | HD | 251.592999 | 249.427999 | 86.0 |
2020-07-17 | 260.029999 | 260.649994 | 257.720001 | 260.380005 | 242.628845 | 3091300.0 | HD | 252.781000 | 249.986499 | 86.0 |
2020-07-20 | 259.040009 | 261.200012 | 258.019989 | 260.170013 | 242.433228 | 2451100.0 | HD | 253.843001 | 250.662500 | 85.0 |
2020-07-21 | 261.619995 | 263.869995 | 260.720001 | 262.420013 | 244.529800 | 2439900.0 | HD | 255.350002 | 251.325500 | 85.0 |
2020-07-22 | 262.589996 | 265.589996 | 262.000000 | 265.170013 | 247.092316 | 2750000.0 | HD | 256.950003 | 252.068501 | 85.0 |
2020-07-23 | 267.799988 | 267.799988 | 261.799988 | 263.809998 | 245.824997 | 2680100.0 | HD | 258.535002 | 252.952501 | 85.0 |
2020-07-24 | 265.040009 | 266.890015 | 262.989990 | 265.309998 | 247.222717 | 2984500.0 | HD | 260.055002 | 253.949001 | 85.0 |
2020-07-27 | 265.089996 | 268.679993 | 265.089996 | 267.420013 | 249.188919 | 2412500.0 | HD | 261.835004 | 255.269501 | 85.0 |
2020-07-28 | 268.559998 | 269.070007 | 264.670013 | 265.279999 | 247.194794 | 2227000.0 | HD | 262.584003 | 256.227502 | 85.0 |
2020-07-29 | 264.799988 | 267.109985 | 264.170013 | 264.660004 | 246.617065 | 2874100.0 | HD | 263.270004 | 256.935002 | 85.0 |
2020-07-30 | 263.339996 | 267.350006 | 261.549988 | 266.309998 | 248.154556 | 2347800.0 | HD | 264.093005 | 257.843002 | 85.0 |
2020-07-31 | 265.000000 | 267.170013 | 260.609985 | 265.489990 | 247.390457 | 3640600.0 | HD | 264.604004 | 258.692502 | 85.0 |
2020-08-03 | 266.730011 | 268.579987 | 265.670013 | 266.179993 | 248.033463 | 2363500.0 | HD | 265.205002 | 259.524001 | 80.0 |
2020-08-04 | 266.630005 | 267.890015 | 263.839996 | 267.869995 | 249.608231 | 2224000.0 | HD | 265.750000 | 260.550001 | 80.0 |
2020-08-05 | 268.390015 | 268.390015 | 265.890015 | 267.480011 | 249.244797 | 1959900.0 | HD | 265.981000 | 261.465501 | 80.0 |
2020-08-06 | 266.600006 | 270.440002 | 266.529999 | 269.369995 | 251.005936 | 2203400.0 | HD | 266.537000 | 262.536001 | 80.0 |
2020-08-07 | 270.609985 | 274.920013 | 269.809998 | 271.640015 | 253.121140 | 2846300.0 | HD | 267.170001 | 263.612502 | 80.0 |
2020-08-10 | 272.420013 | 275.000000 | 271.799988 | 274.730011 | 256.000580 | 2393100.0 | HD | 267.901001 | 264.868002 | 79.0 |
2020-08-11 | 277.690002 | 279.369995 | 274.410004 | 274.920013 | 256.177612 | 3321300.0 | HD | 268.865002 | 265.724503 | 79.0 |
2020-08-12 | 279.750000 | 282.970001 | 276.959991 | 281.579987 | 262.383514 | 3867900.0 | HD | 270.557001 | 266.913503 | 79.0 |
2020-08-13 | 281.160004 | 282.649994 | 279.739990 | 281.660004 | 262.458069 | 2202400.0 | HD | 272.092001 | 268.092503 | 79.0 |
2020-08-14 | 281.140015 | 282.000000 | 279.190002 | 280.549988 | 261.423767 | 2490400.0 | HD | 273.598001 | 269.101003 | 79.0 |
!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
kw_list
['HD', 'Home Depot']
# The following code collects data related to 'HD' and Home Depot from
# "markets" and "MarketWatch" twitter accounts published in the last three years
import snscrape.modules.twitter as sntwitter
import pandas as pd
import datetime as dt
from datetime import date
# Creating list to append tweet data to
attributes_container = []
sources_all =['CNBC','cnn', 'cnnbrk','MarketWatch', 'Benzinga', 'Stocktwits','BreakoutStocks',
'bespokeinvest','WSJMarkets','Stephanie_Link','nytimesbusiness','IBDinvestors','WSJDealJournal',
'business', 'TheEconomist','WSJ', 'ABC', 'CBSNews','FoxNews', 'NBCNews']
sources=['markets','MarketWatch']
for tweet in sntwitter.TwitterSearchScraper('from:{} since:{} until:{}'.format(sources[0], (date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'), date.today().strftime('%Y-%m-%d'))).get_items():
if 'Home Depot'.lower() in tweet.content.lower():
attributes_container.append([tweet.date, tweet.content.split('http')[0]])
tweets_df = pd.DataFrame(attributes_container, columns=["Date Created", "Tweets"])
tweets_df.drop_duplicates(subset='Tweets',inplace=True)
attributes_container = []
for tweet in sntwitter.TwitterSearchScraper('from:{} since:{} until:{}'.format(sources[1], (date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'), date.today().strftime('%Y-%m-%d'))).get_items():
if 'Home Depot'.lower() in tweet.content.lower():
attributes_container.append([tweet.date, tweet.content.split('http')[0]])
# Creating a dataframe from the tweets list above
tweets_df_marketwatch = pd.DataFrame(attributes_container, columns=["Date Created", "Tweets"])
print(tweets_df_marketwatch.shape)
tweets_df_marketwatch.drop_duplicates(subset='Tweets',inplace=True)
pd.set_option("max_colwidth", None)
tweets_df.head()
Date Created | Tweets | |
---|---|---|
0 | 2023-03-01 18:05:43+00:00 | From Meta to Home Depot, corporate America is talking about AI on earnings calls |
1 | 2023-02-21 19:06:41+00:00 | Home Depot, Walmart and DocuSign.\n\n@RitikaGuptaTV has your stocks to watch this Tuesday |
2 | 2023-02-21 15:15:07+00:00 | Home Depot forecasts a fiscal-year profit decline and announces plans for a $1 billion wage investment for hourly workers |
3 | 2022-11-15 18:11:03+00:00 | Home Depot reported profit that beat expectations, with the CEO saying consumers are staying resilient. But a drop in transactions volume has investors concerned |
4 | 2022-08-16 15:09:26+00:00 | Home Depot’s second-quarter results beat Wall Street estimates even as the US housing market shows signs of cooling off |
tweets_df_marketwatch.shape
(1159, 3)
!pip install pygooglenews
kw_list
['Home Depot']
import pygooglenews
from pygooglenews import GoogleNews
gn = GoogleNews()
headlines_related = []
date_list = pd.date_range(end=date.today().strftime('%Y-%m-%d'), start=(date.today()-relativedelta(years=3)).strftime('%Y-%m-%d')).tolist()
clean_date = [str(i).split(" ")[0] for i in date_list]
for date in clean_date:
headlines = []
for word in kw_list:
search =gn.search(word,when=date)
for item in search['entries']:
headlines.append(item['title'])
#We have headlines of the news whose body may contain one of the keywords.
#We will use only headlines to make predictions but some headlines do not contain any of the keywords.
#Only include headlines that has one of the keywords
for headline in headlines:
for i in range(len(kw_list)):
if kw_list[i] in headline:
if headline not in headlines_related:
headlines_related.append((date,headline))
len(headlines_related)
1739
headlines_related[:10]
[('2020-06-08', "Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... - USA TODAY"), ('2020-06-17', "Here's the Defining Characteristic of Home Depot's Success - Nasdaq"), ('2020-06-17', 'Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more - 9to5Toys'), ('2020-06-18', 'Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more - 9to5Toys'), ('2020-06-24', '1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa - OCRegister'), ('2020-06-25', "'It's dehumanizing': Home Depot employee felt colleague's racist ... - Hamilton Spectator"), ('2020-06-30', 'Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna-Glo ... - USA TODAY'), ('2020-06-30', 'The Home Depot Announces Renewable Energy Goal and Pledges ... - PR Newswire'), ('2020-07-01', 'Home Depot bans some rope sales after nooses were found tied on ... - Courier Journal'), ('2020-07-02', 'Home Depot changes rope sales practice after nooses are found in store - CNN')]
df_news = pd.DataFrame(headlines_related, columns=['Date', 'Headline'])
df_news['Headline'] = df_news['Headline'].apply(lambda x: x.split('-')[0])
df_news['Date'] = pd.to_datetime(df_news['Date'])
print(df_news.shape)
df_news.drop_duplicates(subset='Headline', inplace=True)
print(df_news.shape)
df_news.head()
(1739, 2) (1373, 2)
Date | Headline | |
---|---|---|
0 | 2020-06-08 | Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... |
1 | 2020-06-17 | Here's the Defining Characteristic of Home Depot's Success |
2 | 2020-06-17 | Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more |
3 | 2020-06-18 | Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more |
4 | 2020-06-24 | 1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa |
!pip install vaderSentiment
tweets_df_marketwatch.drop("User Name", axis=1, inplace=True)
#combine tweets coming from markets, and marketwatch
markets = pd.concat([tweets_df, tweets_df_marketwatch])
markets.shape
(1169, 2)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
def vader(row):
comp = sid.polarity_scores(row)['compound']
return comp
import re
markets['Tweets'] = markets['Tweets'].apply(lambda x: re.sub("amp;","&", x))
markets['Tweets'] = markets['Tweets'].apply(lambda x: re.sub("\n"," ", x))
markets['Tweets'] = markets['Tweets'].apply(lambda x: re.sub("\xa0"," ", x))
markets.head()
Date Created | Tweets | |
---|---|---|
0 | 2023-03-01 18:05:43+00:00 | From Meta to Home Depot, corporate America is talking about AI on earnings calls |
1 | 2023-02-21 19:06:41+00:00 | Home Depot, Walmart and DocuSign. @RitikaGuptaTV has your stocks to watch this Tuesday |
2 | 2023-02-21 15:15:07+00:00 | Home Depot forecasts a fiscal-year profit decline and announces plans for a $1 billion wage investment for hourly workers |
3 | 2022-11-15 18:11:03+00:00 | Home Depot reported profit that beat expectations, with the CEO saying consumers are staying resilient. But a drop in transactions volume has investors concerned |
4 | 2022-08-16 15:09:26+00:00 | Home Depot’s second-quarter results beat Wall Street estimates even as the US housing market shows signs of cooling off |
markets['Date Created'] = markets['Date Created'].astype('str')
markets['Date Created'] = markets['Date Created'].apply(lambda x:x[:11])
markets.head()
Date Created | Tweets | |
---|---|---|
0 | 2023-03-01 | From Meta to Home Depot, corporate America is talking about AI on earnings calls |
1 | 2023-02-21 | Home Depot, Walmart and DocuSign. @RitikaGuptaTV has your stocks to watch this Tuesday |
2 | 2023-02-21 | Home Depot forecasts a fiscal-year profit decline and announces plans for a $1 billion wage investment for hourly workers |
3 | 2022-11-15 | Home Depot reported profit that beat expectations, with the CEO saying consumers are staying resilient. But a drop in transactions volume has investors concerned |
4 | 2022-08-16 | Home Depot’s second-quarter results beat Wall Street estimates even as the US housing market shows signs of cooling off |
markets['Date Created'] = pd.to_datetime(markets['Date Created'])
markets.sort_values(by='Date Created', inplace=True)
markets.index =range(markets.shape[0])
markets.head()
Date Created | Tweets | |
---|---|---|
0 | 2020-06-06 | What to make of the recent jobs numbers? Joseph Stiglitz says there's more than meets the eye, including those who aren't considered "unemployed," but aren't working. And without intervention, things may get worse, he says. WATCH: |
1 | 2020-06-08 | Bull, bear, bull, bear and now a new bull market — whatever’s next, these stocks will outperform, strategist says |
2 | 2020-06-09 | HD Supply misses on profit expectations but beats on sales |
3 | 2020-06-11 | Normally, Madrid swells with tourists. Our editor @bkollmeyer writes about how locals are reclaiming their city in a summer without travel. |
4 | 2020-06-11 | Should I tell my sister that her husband, a notorious spender, has a secret credit card? |
markets['VaderSent'] = markets['Tweets'].apply(vader)
markets
Date Created | Tweets | VaderSent | |
---|---|---|---|
0 | 2020-06-06 | What to make of the recent jobs numbers? Joseph Stiglitz says there's more than meets the eye, including those who aren't considered "unemployed," but aren't working. And without intervention, things may get worse, he says. WATCH: | -0.6310 |
1 | 2020-06-08 | Bull, bear, bull, bear and now a new bull market — whatever’s next, these stocks will outperform, strategist says | 0.0000 |
2 | 2020-06-09 | HD Supply misses on profit expectations but beats on sales | 0.1280 |
3 | 2020-06-11 | Normally, Madrid swells with tourists. Our editor @bkollmeyer writes about how locals are reclaiming their city in a summer without travel. | 0.0000 |
4 | 2020-06-11 | Should I tell my sister that her husband, a notorious spender, has a secret credit card? | -0.0772 |
... | ... | ... | ... |
1164 | 2023-05-30 | Trans designer in Target anti-LGBTQ+ backlash says he was ‘dealt the worst hand’ | -0.6249 |
1165 | 2023-05-31 | Brown-Forman to invest $200 million to expand Tequila distillery in Jalisco, Mexico | 0.3182 |
1166 | 2023-06-02 | Zelle and Chase working to resolve duplicate-payments issue | 0.3818 |
1167 | 2023-06-03 | The ‘best job in America’ pays over $120,000 a year, offers low stress, healthy work-life balance — and its workers are in high demand | -0.4019 |
1168 | 2023-06-05 | Cava sets IPO terms, as restaurant chain is set to be valued at more than $2 billion | 0.4404 |
1169 rows × 3 columns
#Vader gives a sentiment value between -1 and 1, -1 being the most negative,
#1 being the most positive and 0 being neutral. We will put them in the corresponding
# pos, neu or neg bins based on this value.
def bins(value):
hold_sent = []
if -1 <= value < -0.33:
hold_sent.append('neg')
if -0.33<=value<=0.33:
hold_sent.append('neu')
if 0.33 < value <= 1:
hold_sent.append('pos')
return hold_sent
markets['sentiment'] = markets['VaderSent'].apply(bins)
values=[]
for i in markets['sentiment'].values:
values.append(i[0])
markets['sentiment']=values
markets.head(10)
Date Created | Tweets | VaderSent | sentiment | |
---|---|---|---|---|
0 | 2020-06-06 | What to make of the recent jobs numbers? Joseph Stiglitz says there's more than meets the eye, including those who aren't considered "unemployed," but aren't working. And without intervention, things may get worse, he says. WATCH: | -0.6310 | neg |
1 | 2020-06-08 | Bull, bear, bull, bear and now a new bull market — whatever’s next, these stocks will outperform, strategist says | 0.0000 | neu |
2 | 2020-06-09 | HD Supply misses on profit expectations but beats on sales | 0.1280 | neu |
3 | 2020-06-11 | Normally, Madrid swells with tourists. Our editor @bkollmeyer writes about how locals are reclaiming their city in a summer without travel. | 0.0000 | neu |
4 | 2020-06-11 | Should I tell my sister that her husband, a notorious spender, has a secret credit card? | -0.0772 | neu |
5 | 2020-06-12 | Palantir Technologies Inc. is reportedly preparing to confidentially file for its long-awaited IPO. | 0.0000 | neu |
6 | 2020-06-15 | It's a well-known secret of Wall Street: Little business actually takes place in New York. We spoke to several Wall Street road warriors to learn what the past 3 months have been like when they can't fly anywhere. | 0.3612 | pos |
7 | 2020-06-16 | Dear airline passengers: Wear your face mask or you might get banned from flying. | -0.1027 | neu |
8 | 2020-06-18 | Facebook takes down Trump-Pence ads featuring symbols previously used by Nazis | 0.0000 | neu |
9 | 2020-06-18 | Dow opens with 170 point drop as jobless claims stay elevated | -0.2732 | neu |
#Each week determine the weight of positive, neutral and negative tweets:
tweets_weight = pd.DataFrame(markets.groupby([pd.Grouper(key='Date Created', freq='W')])['sentiment'], columns=['Date','Data'])
tweets_weight.head()
for i in range(len(tweets_weight)):
n = dict(tweets_weight.loc[i,'Data'].value_counts(normalize=True))
if 'neg' not in n:
n['neg']=0
if 'pos' not in n:
n['pos']=0
if 'neu' not in n:
n['neu']=0
tweets_weight.loc[i,'tweet_pos'] = n['pos']
tweets_weight.loc[i,'tweet_neg'] = n['neg']
tweets_weight.loc[i,'tweet_neu'] = n['neu']
tweets_weight
Date | Data | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|---|
0 | 2020-06-07 | 0 neg Name: sentiment, dtype: object | 0.000000 | 1.000000 | 0.000000 |
1 | 2020-06-14 | 1 neu 2 neu 3 neu 4 neu 5 neu Name: sentiment, dtype: object | 0.000000 | 0.000000 | 1.000000 |
2 | 2020-06-21 | 6 pos 7 neu 8 neu 9 neu 10 pos 11 neu Name: sentiment, dtype: object | 0.333333 | 0.000000 | 0.666667 |
3 | 2020-06-28 | 12 neu 13 neu 14 neu 15 neu 16 neg 17 neu 18 neu 19 neg 20 neu Name: sentiment, dtype: object | 0.000000 | 0.222222 | 0.777778 |
4 | 2020-07-05 | 21 neg 22 neg 23 neg 24 neg 25 pos 26 neu 27 neg 28 neu 29 neu 30 neu Name: sentiment, dtype: object | 0.100000 | 0.500000 | 0.400000 |
... | ... | ... | ... | ... | ... |
153 | 2023-05-14 | 1141 pos 1142 neu 1143 neu 1144 neu 1145 neg Name: sentiment, dtype: object | 0.200000 | 0.200000 | 0.600000 |
154 | 2023-05-21 | 1146 neu 1147 neu 1148 neu 1149 neu 1150 neu 1151 neu 1152 neg 1153 neu 1154 neg 1155 neu 1156 pos 1157 neu 1158 neu 1159 neu 1160 neu Name: sentiment, dtype: object | 0.066667 | 0.133333 | 0.800000 |
155 | 2023-05-28 | 1161 neg 1162 neg 1163 neg Name: sentiment, dtype: object | 0.000000 | 1.000000 | 0.000000 |
156 | 2023-06-04 | 1164 neg 1165 neu 1166 pos 1167 neg Name: sentiment, dtype: object | 0.250000 | 0.500000 | 0.250000 |
157 | 2023-06-11 | 1168 pos Name: sentiment, dtype: object | 1.000000 | 0.000000 | 0.000000 |
158 rows × 5 columns
tweets_weight.drop('Data', inplace=True, axis=1)
tweets_weight.head()
Date | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|
0 | 2020-06-07 | 0.000000 | 1.000000 | 0.000000 |
1 | 2020-06-14 | 0.000000 | 0.000000 | 1.000000 |
2 | 2020-06-21 | 0.333333 | 0.000000 | 0.666667 |
3 | 2020-06-28 | 0.000000 | 0.222222 | 0.777778 |
4 | 2020-07-05 | 0.100000 | 0.500000 | 0.400000 |
tweets_weight.plot(x='Date', y=['tweet_pos','tweet_neu','tweet_neg'],title='Tweets\' Sentiment')
<AxesSubplot: title={'center': "Tweets' Sentiment"}, xlabel='Date'>
Tweets were mostly neutral in tone. Unsurprisingly, there is a negative correlation between positive and negative tweets.
df_news['VaderSent'] = df_news['Headline'].apply(vader)
df_news.head(10)
Date | Headline | VaderSent | |
---|---|---|---|
0 | 2020-06-08 | Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... | 0.4939 |
1 | 2020-06-17 | Here's the Defining Characteristic of Home Depot's Success | 0.5719 |
2 | 2020-06-17 | Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more | 0.0000 |
3 | 2020-06-18 | Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more | 0.2500 |
4 | 2020-06-24 | 1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa | -0.8860 |
5 | 2020-06-25 | 'It's dehumanizing': Home Depot employee felt colleague's racist ... | -0.8126 |
6 | 2020-06-30 | Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna | 0.2023 |
7 | 2020-06-30 | The Home Depot Announces Renewable Energy Goal and Pledges ... | 0.2732 |
8 | 2020-07-01 | Home Depot bans some rope sales after nooses were found tied on ... | 0.0000 |
9 | 2020-07-02 | Home Depot changes rope sales practice after nooses are found in store | 0.0000 |
df_news['Date'] = pd.to_datetime(df_news['Date'])
df_news['sentiment'] = df_news['VaderSent'].apply(bins)
values=[]
for i in df_news['sentiment'].values:
values.append(i[0])
df_news['sentiment']=values
df_news.head(10)
Date | Headline | VaderSent | sentiment | |
---|---|---|---|---|
0 | 2020-06-08 | Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... | 0.4939 | pos |
1 | 2020-06-17 | Here's the Defining Characteristic of Home Depot's Success | 0.5719 | pos |
2 | 2020-06-17 | Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more | 0.0000 | neu |
3 | 2020-06-18 | Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more | 0.2500 | neu |
4 | 2020-06-24 | 1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa | -0.8860 | neg |
5 | 2020-06-25 | 'It's dehumanizing': Home Depot employee felt colleague's racist ... | -0.8126 | neg |
6 | 2020-06-30 | Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna | 0.2023 | neu |
7 | 2020-06-30 | The Home Depot Announces Renewable Energy Goal and Pledges ... | 0.2732 | neu |
8 | 2020-07-01 | Home Depot bans some rope sales after nooses were found tied on ... | 0.0000 | neu |
9 | 2020-07-02 | Home Depot changes rope sales practice after nooses are found in store | 0.0000 | neu |
news_weight = pd.DataFrame(df_news.groupby([pd.Grouper(key='Date', freq='W')])['sentiment'], columns=['Date','Data'])
for i in range(len(news_weight)):
n = dict(news_weight.loc[i,'Data'].value_counts(normalize=True))
if 'neg' not in n:
n['neg']=0
if 'pos' not in n:
n['pos']=0
if 'neu' not in n:
n['neu']=0
news_weight.loc[i,'news_pos'] = n['pos']
news_weight.loc[i,'news_neg'] = n['neg']
news_weight.loc[i,'news_neu'] = n['neu']
news_weight
Date | Data | news_pos | news_neg | news_neu | |
---|---|---|---|---|---|
0 | 2020-06-14 | 0 pos Name: sentiment, dtype: object | 1.000000 | 0.000000 | 0.000000 |
1 | 2020-06-21 | 1 pos 2 neu 3 neu Name: sentiment, dtype: object | 0.333333 | 0.000000 | 0.666667 |
2 | 2020-06-28 | 4 neg 5 neg Name: sentiment, dtype: object | 0.000000 | 1.000000 | 0.000000 |
3 | 2020-07-05 | 6 neu 7 neu 8 neu 9 neu 10 neu 11 neg Name: sentiment, dtype: object | 0.000000 | 0.166667 | 0.833333 |
4 | 2020-07-12 | 12 neu 13 neu Name: sentiment, dtype: object | 0.000000 | 0.000000 | 1.000000 |
... | ... | ... | ... | ... | ... |
152 | 2023-05-14 | 1546 pos 1547 neu 1548 neu 1549 pos 1550 pos 1551 neu 1552 neu 1562 neu 1570 neu 1571 neg 1577 neu 1578 neu 1579 neu 1581 neu 1584 neg 1586 neu Name: sentiment, dtype: object | 0.187500 | 0.125000 | 0.687500 |
153 | 2023-05-21 | 1587 neg 1589 neu 1590 neu 1592 neu 1593 neg 1594 neg 1595 neu 1596 neg 1597 neu 1598 neu 1599 neu 1600 neu 1601 pos 1602 neu 1603 neu 1604 neu 1605 neu 1606 neu 1608 neu 1609 neg 1610 neu 1611 neg 1614 neg 1615 pos 1616 neu 1619 neu 1620 neu 1621 neu 1623 pos 1631 neu 1632 neu 1633 neu Name: sentiment, dtype: object | 0.093750 | 0.218750 | 0.687500 |
154 | 2023-05-28 | 1634 neu 1638 pos 1640 pos 1641 neu 1642 neu 1643 neg 1646 neu 1647 neu 1648 neu 1649 neg 1650 neu 1658 neu 1659 neg 1666 neu 1667 neu 1668 neu 1669 neu 1670 neu 1671 neg 1672 neu 1673 neg 1674 neg 1676 pos Name: sentiment, dtype: object | 0.130435 | 0.260870 | 0.608696 |
155 | 2023-06-04 | 1677 pos 1679 neg 1680 neu 1682 neu 1683 neg 1685 neu 1692 neu 1694 neg 1696 neg 1698 pos 1699 pos 1700 pos 1704 pos 1705 neu 1706 neg 1708 neu 1710 pos 1711 neu 1713 neu 1716 neu 1718 neu 1719 neg 1721 neu 1722 neu 1724 neu Name: sentiment, dtype: object | 0.240000 | 0.240000 | 0.520000 |
156 | 2023-06-11 | 1725 pos 1726 neu 1727 neu 1728 neu 1731 neg 1732 neu 1733 neg 1734 neu 1735 neu 1736 neg 1737 neg Name: sentiment, dtype: object | 0.090909 | 0.363636 | 0.545455 |
157 rows × 5 columns
news_weight.drop('Data', inplace=True, axis=1)
news_weight
Date | news_pos | news_neg | news_neu | |
---|---|---|---|---|
0 | 2020-06-14 | 1.000000 | 0.000000 | 0.000000 |
1 | 2020-06-21 | 0.333333 | 0.000000 | 0.666667 |
2 | 2020-06-28 | 0.000000 | 1.000000 | 0.000000 |
3 | 2020-07-05 | 0.000000 | 0.166667 | 0.833333 |
4 | 2020-07-12 | 0.000000 | 0.000000 | 1.000000 |
... | ... | ... | ... | ... |
152 | 2023-05-14 | 0.187500 | 0.125000 | 0.687500 |
153 | 2023-05-21 | 0.093750 | 0.218750 | 0.687500 |
154 | 2023-05-28 | 0.130435 | 0.260870 | 0.608696 |
155 | 2023-06-04 | 0.240000 | 0.240000 | 0.520000 |
156 | 2023-06-11 | 0.090909 | 0.363636 | 0.545455 |
157 rows × 4 columns
news_weight.plot(x='Date', y=['news_pos','news_neg','news_neu'], title='News Sentiment')
<AxesSubplot: title={'center': 'News Sentiment'}, xlabel='Date'>
News were also mostly neutral in tone. Again, there is a negative correlation between positive and negative tweets.
df_combined.drop('ticker', inplace=True, axis=1)
df_combined.head()
open | high | low | close | adjclose | volume | TwoWeeks | Month | Home Depot | |
---|---|---|---|---|---|---|---|---|---|
2020-07-06 | 250.270004 | 251.500000 | 247.039993 | 249.550003 | 232.537186 | 3133800.0 | 247.481999 | 248.1370 | 90.0 |
2020-07-07 | 247.369995 | 250.779999 | 247.070007 | 247.350006 | 230.487167 | 2927800.0 | 247.300999 | 247.6660 | 90.0 |
2020-07-08 | 247.869995 | 249.789993 | 246.220001 | 249.169998 | 232.183075 | 2294000.0 | 247.187000 | 247.2865 | 90.0 |
2020-07-09 | 249.660004 | 250.509995 | 246.350006 | 247.960007 | 231.055588 | 2994700.0 | 247.370000 | 246.9620 | 90.0 |
2020-07-10 | 248.289993 | 250.330002 | 246.639999 | 250.110001 | 233.058990 | 2745300.0 | 247.842999 | 247.4940 | 90.0 |
df_combined = df_combined.reset_index()
df_combined.rename(columns={'index':'Date'}, inplace=True)
df_combined.head()
Date | open | high | low | close | adjclose | volume | TwoWeeks | Month | Home Depot | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-07-06 | 250.270004 | 251.500000 | 247.039993 | 249.550003 | 232.537186 | 3133800.0 | 247.481999 | 248.1370 | 90.0 |
1 | 2020-07-07 | 247.369995 | 250.779999 | 247.070007 | 247.350006 | 230.487167 | 2927800.0 | 247.300999 | 247.6660 | 90.0 |
2 | 2020-07-08 | 247.869995 | 249.789993 | 246.220001 | 249.169998 | 232.183075 | 2294000.0 | 247.187000 | 247.2865 | 90.0 |
3 | 2020-07-09 | 249.660004 | 250.509995 | 246.350006 | 247.960007 | 231.055588 | 2994700.0 | 247.370000 | 246.9620 | 90.0 |
4 | 2020-07-10 | 248.289993 | 250.330002 | 246.639999 | 250.110001 | 233.058990 | 2745300.0 | 247.842999 | 247.4940 | 90.0 |
import numpy as np
df_weekly = pd.DataFrame(df_combined.groupby([pd.Grouper(key='Date', freq='W')]).agg(np.mean))
df_weekly.head()
open | high | low | close | adjclose | volume | TwoWeeks | Month | Home Depot | |
---|---|---|---|---|---|---|---|---|---|
Date | |||||||||
2020-07-12 | 248.691998 | 250.581998 | 246.664001 | 248.828003 | 231.864401 | 2819120.0 | 247.436599 | 247.509100 | 90.0 |
2020-07-19 | 255.570004 | 259.695996 | 253.284000 | 256.733997 | 239.231400 | 3799580.0 | 250.709800 | 249.001500 | 86.0 |
2020-07-26 | 263.217999 | 265.070001 | 261.105994 | 263.376007 | 245.420612 | 2661120.0 | 256.946602 | 252.191601 | 85.0 |
2020-08-02 | 265.357996 | 267.876001 | 263.217999 | 265.832001 | 247.709158 | 2700400.0 | 263.277204 | 256.993502 | 85.0 |
2020-08-09 | 267.792004 | 270.044006 | 266.348004 | 268.508002 | 250.202713 | 2319420.0 | 266.128600 | 261.537601 | 80.0 |
df_final = df_weekly.merge(news_weight, on='Date').merge(tweets_weight, on='Date')
df_final.columns = [i.lower() for i in df_final.columns]
df_final.head()
date | open | high | low | close | adjclose | volume | twoweeks | month | home depot | news_pos | news_neg | news_neu | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-07-12 | 248.691998 | 250.581998 | 246.664001 | 248.828003 | 231.864401 | 2819120.0 | 247.436599 | 247.509100 | 90.0 | 0.000000 | 0.00 | 1.000000 | 0.000000 | 0.166667 | 0.833333 |
1 | 2020-07-19 | 255.570004 | 259.695996 | 253.284000 | 256.733997 | 239.231400 | 3799580.0 | 250.709800 | 249.001500 | 86.0 | 0.000000 | 0.75 | 0.250000 | 0.000000 | 0.400000 | 0.600000 |
2 | 2020-07-26 | 263.217999 | 265.070001 | 261.105994 | 263.376007 | 245.420612 | 2661120.0 | 256.946602 | 252.191601 | 85.0 | 0.333333 | 0.00 | 0.666667 | 0.125000 | 0.125000 | 0.750000 |
3 | 2020-08-02 | 265.357996 | 267.876001 | 263.217999 | 265.832001 | 247.709158 | 2700400.0 | 263.277204 | 256.993502 | 85.0 | 0.200000 | 0.20 | 0.600000 | 0.181818 | 0.272727 | 0.545455 |
4 | 2020-08-09 | 267.792004 | 270.044006 | 266.348004 | 268.508002 | 250.202713 | 2319420.0 | 266.128600 | 261.537601 | 80.0 | 0.333333 | 0.00 | 0.666667 | 0.333333 | 0.222222 | 0.444444 |
df_final.rename(columns={"home depot":"trends"}, inplace=True)
df_final.head()
date | open | high | low | close | adjclose | volume | twoweeks | month | trends | news_pos | news_neg | news_neu | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-07-12 | 248.691998 | 250.581998 | 246.664001 | 248.828003 | 231.864401 | 2819120.0 | 247.436599 | 247.509100 | 90.0 | 0.000000 | 0.00 | 1.000000 | 0.000000 | 0.166667 | 0.833333 |
1 | 2020-07-19 | 255.570004 | 259.695996 | 253.284000 | 256.733997 | 239.231400 | 3799580.0 | 250.709800 | 249.001500 | 86.0 | 0.000000 | 0.75 | 0.250000 | 0.000000 | 0.400000 | 0.600000 |
2 | 2020-07-26 | 263.217999 | 265.070001 | 261.105994 | 263.376007 | 245.420612 | 2661120.0 | 256.946602 | 252.191601 | 85.0 | 0.333333 | 0.00 | 0.666667 | 0.125000 | 0.125000 | 0.750000 |
3 | 2020-08-02 | 265.357996 | 267.876001 | 263.217999 | 265.832001 | 247.709158 | 2700400.0 | 263.277204 | 256.993502 | 85.0 | 0.200000 | 0.20 | 0.600000 | 0.181818 | 0.272727 | 0.545455 |
4 | 2020-08-09 | 267.792004 | 270.044006 | 266.348004 | 268.508002 | 250.202713 | 2319420.0 | 266.128600 | 261.537601 | 80.0 | 0.333333 | 0.00 | 0.666667 | 0.333333 | 0.222222 | 0.444444 |
#Scaling the data
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()
df_final_scaled = pd.DataFrame(SS.fit_transform(df_final.iloc[:,1:]), columns=df_final.columns[1:])
df_final_scaled.insert(0, "date", df_final['date'])
df_final_scaled.head()
date | open | high | low | close | adjclose | volume | twoweeks | month | trends | news_pos | news_neg | news_neu | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-07-12 | -1.703818 | -1.730261 | -1.682950 | -1.705764 | -1.872820 | -0.840392 | -1.738792 | -1.736071 | 2.580289 | -1.106820 | -1.193801 | 1.766450 | -1.503002 | -0.103907 | 1.464049 |
1 | 2020-07-19 | -1.505790 | -1.470011 | -1.490960 | -1.477822 | -1.658905 | -0.124160 | -1.644438 | -1.692888 | 2.198301 | -1.106820 | 3.504942 | -1.899286 | -1.503002 | 1.212374 | 0.306837 |
2 | 2020-07-26 | -1.285591 | -1.316556 | -1.264110 | -1.286323 | -1.479190 | -0.955812 | -1.464655 | -1.600581 | 2.102805 | 1.052492 | -1.193801 | 0.137234 | -0.824220 | -0.338957 | 1.050759 |
3 | 2020-08-02 | -1.223978 | -1.236431 | -1.202859 | -1.215514 | -1.412737 | -0.927118 | -1.282168 | -1.461636 | 2.102805 | 0.188767 | 0.059197 | -0.188609 | -0.515683 | 0.494403 | 0.036320 |
4 | 2020-08-09 | -1.153899 | -1.174523 | -1.112084 | -1.138360 | -1.340332 | -1.205426 | -1.199974 | -1.330150 | 1.625320 | 1.052492 | -1.193801 | 0.137234 | 0.307084 | 0.209494 | -0.464638 |
df_final_scaled.corr().round(3)
<ipython-input-214-f6a92ee3555b>:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. df_final_scaled.corr().round(3)
open | high | low | close | adjclose | volume | twoweeks | month | trends | news_pos | news_neg | news_neu | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
open | 1.000 | 0.999 | 0.999 | 0.998 | 0.986 | -0.051 | 0.983 | 0.954 | -0.221 | 0.241 | -0.043 | -0.148 | 0.009 | -0.075 | 0.058 |
high | 0.999 | 1.000 | 0.998 | 0.999 | 0.988 | -0.040 | 0.982 | 0.954 | -0.227 | 0.246 | -0.038 | -0.156 | 0.010 | -0.074 | 0.056 |
low | 0.999 | 0.998 | 1.000 | 0.999 | 0.985 | -0.075 | 0.978 | 0.947 | -0.210 | 0.238 | -0.045 | -0.145 | 0.008 | -0.079 | 0.062 |
close | 0.998 | 0.999 | 0.999 | 1.000 | 0.988 | -0.059 | 0.978 | 0.949 | -0.218 | 0.245 | -0.041 | -0.152 | 0.009 | -0.077 | 0.059 |
adjclose | 0.986 | 0.988 | 0.985 | 0.988 | 1.000 | -0.042 | 0.970 | 0.945 | -0.321 | 0.267 | -0.013 | -0.192 | 0.011 | -0.072 | 0.053 |
volume | -0.051 | -0.040 | -0.075 | -0.059 | -0.042 | 1.000 | 0.018 | 0.058 | -0.057 | 0.048 | -0.005 | -0.032 | 0.014 | 0.090 | -0.092 |
twoweeks | 0.983 | 0.982 | 0.978 | 0.978 | 0.970 | 0.018 | 1.000 | 0.986 | -0.266 | 0.248 | -0.035 | -0.160 | 0.006 | -0.077 | 0.062 |
month | 0.954 | 0.954 | 0.947 | 0.949 | 0.945 | 0.058 | 0.986 | 1.000 | -0.311 | 0.229 | -0.034 | -0.146 | -0.003 | -0.074 | 0.068 |
trends | -0.221 | -0.227 | -0.210 | -0.218 | -0.321 | -0.057 | -0.266 | -0.311 | 1.000 | -0.227 | -0.072 | 0.227 | -0.015 | 0.006 | 0.008 |
news_pos | 0.241 | 0.246 | 0.238 | 0.245 | 0.267 | 0.048 | 0.248 | 0.229 | -0.227 | 1.000 | -0.151 | -0.637 | -0.074 | 0.002 | 0.066 |
news_neg | -0.043 | -0.038 | -0.045 | -0.041 | -0.013 | -0.005 | -0.035 | -0.034 | -0.072 | -0.151 | 1.000 | -0.666 | -0.004 | -0.005 | 0.008 |
news_neu | -0.148 | -0.156 | -0.145 | -0.152 | -0.192 | -0.032 | -0.160 | -0.146 | 0.227 | -0.637 | -0.666 | 1.000 | 0.059 | 0.003 | -0.056 |
tweet_pos | 0.009 | 0.010 | 0.008 | 0.009 | 0.011 | 0.014 | 0.006 | -0.003 | -0.015 | -0.074 | -0.004 | 0.059 | 1.000 | -0.378 | -0.581 |
tweet_neg | -0.075 | -0.074 | -0.079 | -0.077 | -0.072 | 0.090 | -0.077 | -0.074 | 0.006 | 0.002 | -0.005 | 0.003 | -0.378 | 1.000 | -0.534 |
tweet_neu | 0.058 | 0.056 | 0.062 | 0.059 | 0.053 | -0.092 | 0.062 | 0.068 | 0.008 | 0.066 | 0.008 | -0.056 | -0.581 | -0.534 | 1.000 |
There is a positive correlation between positive news and the stock price in two weeks. We don't see this correlation with positive tweets or any other sentiment columns. Additionally, there is a negative correlation between trends and the stock price in two weeks. This means when people start searching Home Depot on Google News, stock prices starting falling in two weeks.
cols=['news_pos', 'news_neg','news_neu', 'tweet_pos', 'tweet_neg','tweet_neu', 'trends']
fig, ax =plt.subplots(3,3, figsize=(15, 12))
ax=ax.ravel()
for i in range(len(cols)):
df_final_scaled.plot(x='date', y=['close','twoweeks','month', cols[i]], ax=ax[i], title=cols[i])
ax[7].axis('off')
ax[8].axis('off')
plt.tight_layout()
\We can observe the correlation mentioned above in the plots as well. The first plot, representing positive news, indicates that positive news loosely follows the stock price. On the other hand, the last plot depicting trends shows a negative correlation between Google searches and the stock price. Furthermore, over time, the number of Google searches for "Home Depot" has steadily declined.
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
plt.style.use("ggplot")
df_final_scaled = pd.read_csv('df_final_home_depot_scaled.csv')
df_final_scaled.drop('Unnamed: 0', inplace=True, axis=1)
df_final_scaled.head()
date | open | high | low | close | adjclose | volume | twoweeks | month | trends | news_pos | news_neg | news_neu | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-07-12 | -1.703818 | -1.730261 | -1.682950 | -1.705764 | -1.872820 | -0.840392 | -1.738792 | -1.736071 | 2.580289 | -1.106820 | -1.193801 | 1.766450 | -1.503002 | -0.103907 | 1.464049 |
1 | 2020-07-19 | -1.505790 | -1.470011 | -1.490960 | -1.477822 | -1.658905 | -0.124160 | -1.644438 | -1.692888 | 2.198301 | -1.106820 | 3.504942 | -1.899286 | -1.503002 | 1.212374 | 0.306837 |
2 | 2020-07-26 | -1.285591 | -1.316556 | -1.264110 | -1.286323 | -1.479190 | -0.955812 | -1.464655 | -1.600581 | 2.102805 | 1.052492 | -1.193801 | 0.137234 | -0.824220 | -0.338957 | 1.050759 |
3 | 2020-08-02 | -1.223978 | -1.236431 | -1.202859 | -1.215514 | -1.412737 | -0.927118 | -1.282168 | -1.461636 | 2.102805 | 0.188767 | 0.059197 | -0.188609 | -0.515683 | 0.494403 | 0.036320 |
4 | 2020-08-09 | -1.153899 | -1.174523 | -1.112084 | -1.138360 | -1.340332 | -1.205426 | -1.199974 | -1.330150 | 1.625320 | 1.052492 | -1.193801 | 0.137234 | 0.307084 | 0.209494 | -0.464638 |
x= df_final_scaled[['open','high','low','close', 'adjclose','volume','news_pos', 'news_neg', 'news_neu', 'tweet_pos',
'tweet_neg', 'tweet_neu', 'trends']]
y=df_final_scaled['twoweeks']
import statsmodels.api as sm
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: twoweeks R-squared: 0.977 Model: OLS Adj. R-squared: 0.975 Method: Least Squares F-statistic: 537.8 Date: Mon, 12 Jun 2023 Prob (F-statistic): 3.14e-109 Time: 02:36:26 Log-Likelihood: 70.561 No. Observations: 153 AIC: -117.1 Df Residuals: 141 BIC: -80.76 Df Model: 11 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const -1.501e-16 0.013 -1.17e-14 1.000 -0.025 0.025 open 2.3468 0.511 4.594 0.000 1.337 3.357 high -0.2428 0.541 -0.448 0.655 -1.313 0.828 low -0.7347 0.559 -1.314 0.191 -1.840 0.371 close -0.2749 0.545 -0.504 0.615 -1.352 0.802 adjclose -0.1252 0.118 -1.064 0.289 -0.358 0.107 volume 0.0494 0.017 2.981 0.003 0.017 0.082 news_pos 0.0014 0.010 0.141 0.888 -0.019 0.022 news_neg 0.0043 0.010 0.438 0.662 -0.015 0.023 news_neu -0.0044 0.007 -0.594 0.554 -0.019 0.010 tweet_pos -0.0020 0.009 -0.229 0.819 -0.019 0.015 tweet_neg -0.0071 0.009 -0.778 0.438 -0.025 0.011 tweet_neu 0.0081 0.008 1.009 0.315 -0.008 0.024 trends -0.0524 0.019 -2.785 0.006 -0.090 -0.015 ============================================================================== Omnibus: 1.461 Durbin-Watson: 1.694 Prob(Omnibus): 0.482 Jarque-Bera (JB): 1.534 Skew: 0.193 Prob(JB): 0.464 Kurtosis: 2.697 Cond. No. 5.93e+15 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 2.26e-29. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
x= df_final_scaled[['open','high','low','close', 'adjclose','volume']]
y=df_final_scaled['twoweeks']
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3 , shuffle=False,random_state = 0)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score
regression = LinearRegression()
regression.fit(train_x, train_y)
print("regression coefficient",regression.coef_)
print("regression intercept",regression.intercept_)
regression coefficient [ 2.15513818 0.1568129 -0.63183193 -0.94351993 0.2494587 0.05989017] regression intercept 0.017827179785784394
regression_confidence = regression.score(test_x, test_y)
print("linear regression confidence: ", regression_confidence)
linear regression confidence: 0.8354280496114086
predicted=regression.predict(test_x)
print(test_x.head())
open high low close adjclose volume 107 -0.243796 -0.255966 -0.239776 -0.208430 -0.104242 -0.699127 108 -0.115385 -0.079953 -0.082182 -0.058794 0.042456 -0.785429 109 0.099918 0.070874 0.110736 0.082941 0.181410 -0.899154 110 0.366759 0.437578 0.380624 0.428573 0.520257 0.542134 111 0.080282 0.039007 0.007433 -0.028003 0.072644 -0.737113
dfr=pd.DataFrame({'Actual_Price':test_y, 'Predicted_Price':predicted})
dfr.head(10)
Actual_Price | Predicted_Price | |
---|---|---|
107 | -0.251375 | -0.267445 |
108 | -0.142925 | -0.172431 |
109 | -0.051125 | 0.087459 |
110 | 0.166720 | 0.394254 |
111 | 0.250333 | 0.192664 |
112 | -0.043019 | -0.309534 |
113 | -0.333535 | -0.397486 |
114 | -0.476311 | -0.423891 |
115 | -0.740981 | -0.954391 |
116 | -0.992016 | -0.840699 |
dfr.describe()
Actual_Price | Predicted_Price | |
---|---|---|
count | 46.000000 | 46.000000 |
mean | -0.191756 | -0.132461 |
std | 0.438800 | 0.433547 |
min | -0.992016 | -0.954391 |
25% | -0.502699 | -0.417290 |
50% | -0.331794 | -0.271980 |
75% | 0.274308 | 0.247046 |
max | 0.478988 | 0.858482 |
from sklearn import metrics
import numpy as np
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(test_y, predicted))
print('Mean Squared Error (MSE) :', metrics.mean_squared_error(test_y, predicted))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(test_y, predicted)))
Mean Absolute Error (MAE): 0.14249150773348154 Mean Squared Error (MSE) : 0.030998755667901733 Root Mean Squared Error (RMSE): 0.17606463491542454
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price, color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()
plt.plot(dfr.Actual_Price, color='black', label='Actual')
plt.plot(dfr.Predicted_Price, color='blue',label='Predicted')
plt.title("Prediction using only Historical Data")
plt.legend()
<matplotlib.legend.Legend at 0x7fc3b62ce770>
df_final_scaled.columns
Index(['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume', 'twoweeks', 'month', 'trends', 'news_pos', 'news_neg', 'news_neu', 'tweet_pos', 'tweet_neg', 'tweet_neu'], dtype='object')
x= df_final_scaled[['open','high','low','close', 'adjclose','volume','news_pos', 'news_neg', 'news_neu', 'tweet_pos',
'tweet_neg', 'tweet_neu', 'trends']]
y=df_final_scaled['twoweeks']
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3 , shuffle=False,random_state = 0)
regression = LinearRegression()
regression.fit(train_x, train_y)
print("regression coefficient",regression.coef_)
print("regression intercept",regression.intercept_)
regression coefficient [ 1.91871810e+00 1.89637622e-01 -3.49725608e-01 -8.35917172e-01 4.63540880e-02 6.48403899e-02 1.45467759e-03 5.40953328e-03 -5.31782650e-03 -3.13379351e-03 -7.57685954e-03 9.52332816e-03 -5.55338621e-02] regression intercept 0.023765631258188702
regression_confidence = regression.score(test_x, test_y)
print("linear regression confidence: ", regression_confidence)
linear regression confidence: 0.8294859517339385
predicted=regression.predict(test_x)
print(test_x.head())
open high low close adjclose volume news_pos \ 107 -0.243796 -0.255966 -0.239776 -0.208430 -0.104242 -0.699127 0.332721 108 -0.115385 -0.079953 -0.082182 -0.058794 0.042456 -0.785429 -1.106820 109 0.099918 0.070874 0.110736 0.082941 0.181410 -0.899154 0.332721 110 0.366759 0.437578 0.380624 0.428573 0.520257 0.542134 0.512664 111 0.080282 0.039007 0.007433 -0.028003 0.072644 -0.737113 0.512664 news_neg news_neu tweet_pos tweet_neg tweet_neu trends 107 0.894529 -0.948910 0.533344 1.071344 -1.428981 -0.475612 108 0.894529 0.137234 0.824251 -1.044107 0.165137 -0.380115 109 0.894529 -0.948910 -1.503002 0.567665 0.873634 -0.666606 110 -0.802239 0.239060 0.471637 -0.018434 -0.414542 -0.666606 111 0.372447 -0.677374 1.890908 -0.338957 -1.428981 -0.475612
from sklearn.metrics import r2_score
r2_score(predicted, test_y)
0.8290817124619406
dfr=pd.DataFrame({'Actual_Price':test_y, 'Predicted_Price':predicted})
dfr.head(10)
Actual_Price | Predicted_Price | |
---|---|---|
107 | -0.251375 | -0.271245 |
108 | -0.142925 | -0.153351 |
109 | -0.051125 | 0.127086 |
110 | 0.166720 | 0.405224 |
111 | 0.250333 | 0.177392 |
112 | -0.043019 | -0.300313 |
113 | -0.333535 | -0.381184 |
114 | -0.476311 | -0.385414 |
115 | -0.740981 | -0.939092 |
116 | -0.992016 | -0.805985 |
dfr.describe()
Actual_Price | Predicted_Price | |
---|---|---|
count | 46.000000 | 46.000000 |
mean | -0.191756 | -0.112709 |
std | 0.438800 | 0.438281 |
min | -0.992016 | -0.939092 |
25% | -0.502699 | -0.419594 |
50% | -0.331794 | -0.269632 |
75% | 0.274308 | 0.268095 |
max | 0.478988 | 0.895990 |
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(test_y, predicted))
print('Mean Squared Error (MSE) :', metrics.mean_squared_error(test_y, predicted))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(test_y, predicted)))
Mean Absolute Error (MAE): 0.14758907019543632 Mean Squared Error (MSE) : 0.0321180086136408 Root Mean Squared Error (RMSE): 0.17921497876472492
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price, color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()
plt.plot(dfr.Actual_Price, color='black', label='Actual')
plt.plot(dfr.Predicted_Price, color='blue',label='Predicted')
plt.title("Predictions using Historical, Sentiment and Trends")
plt.legend()
<matplotlib.legend.Legend at 0x7fc3b6000850>
import pandas as pd
df = pd.read_csv('df_final_home_depot_scaled.csv')
df.drop('Unnamed: 0', inplace=True, axis=1)
df.head()
date | open | high | low | close | adjclose | volume | twoweeks | month | trends | news_pos | news_neg | news_neu | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-07-12 | -1.703818 | -1.730261 | -1.682950 | -1.705764 | -1.872820 | -0.840392 | -1.738792 | -1.736071 | 2.580289 | -1.106820 | -1.193801 | 1.766450 | -1.503002 | -0.103907 | 1.464049 |
1 | 2020-07-19 | -1.505790 | -1.470011 | -1.490960 | -1.477822 | -1.658905 | -0.124160 | -1.644438 | -1.692888 | 2.198301 | -1.106820 | 3.504942 | -1.899286 | -1.503002 | 1.212374 | 0.306837 |
2 | 2020-07-26 | -1.285591 | -1.316556 | -1.264110 | -1.286323 | -1.479190 | -0.955812 | -1.464655 | -1.600581 | 2.102805 | 1.052492 | -1.193801 | 0.137234 | -0.824220 | -0.338957 | 1.050759 |
3 | 2020-08-02 | -1.223978 | -1.236431 | -1.202859 | -1.215514 | -1.412737 | -0.927118 | -1.282168 | -1.461636 | 2.102805 | 0.188767 | 0.059197 | -0.188609 | -0.515683 | 0.494403 | 0.036320 |
4 | 2020-08-09 | -1.153899 | -1.174523 | -1.112084 | -1.138360 | -1.340332 | -1.205426 | -1.199974 | -1.330150 | 1.625320 | 1.052492 | -1.193801 | 0.137234 | 0.307084 | 0.209494 | -0.464638 |
df.columns
Index(['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume', 'twoweeks', 'month', 'trends', 'news_pos', 'news_neg', 'news_neu', 'tweet_pos', 'tweet_neg', 'tweet_neu'], dtype='object')
df_hist = df[['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume',
'twoweeks', 'month']]
df_hist.shape
(153, 9)
#Append each row to X (high, low, open, close, adjclose, volume)
X = [[df_hist.iloc[j,i+1] for i in range(df_hist.shape[1]-3)] for j in range(df_hist.shape[0])]
#Append all "twoweeks" values
Y = [df_hist.iloc[i,7] for i in range(df_hist.shape[0])]
import numpy as np
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)
train_X = np.array(x_train)
test_X = np.array(x_test)
train_Y = np.array(y_train)
test_Y = np.array(y_test)
train_X = train_X.reshape(train_X.shape[0],1,6,1)
test_X = test_X.reshape(test_X.shape[0],1,6,1)
print(len(train_X))
print(len(test_X))
107 46
train_X[0]
array([[[-1.09683374], [-1.10216509], [-1.12420655], [-1.11575665], [-1.28069411], [-0.48313096]]])
from tensorflow.keras import backend as K
K.image_data_format()=="channels_first"
False
# For creating model and training
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
from tensorflow.keras.layers import MaxPooling1D, Flatten
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.metrics import RootMeanSquaredError
model = tf.keras.Sequential()
# Creating the Neural Network model here...
# CNN layers
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu', input_shape=(None, 6, 1))))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(128, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Flatten()))
# model.add(Dense(5, kernel_regularizer=L2(0.01)))
# LSTM layers
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dropout(0.5))
#Final layers
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])
history = model.fit(train_X, train_Y, validation_data=(test_X,test_Y), epochs=40,batch_size=40, verbose=1, shuffle =True)
Epoch 1/40 3/3 [==============================] - 13s 910ms/step - loss: 0.8740 - mse: 0.8740 - mae: 0.7390 - val_loss: 1.1876 - val_mse: 1.1876 - val_mae: 0.8232 Epoch 2/40 3/3 [==============================] - 0s 34ms/step - loss: 0.8003 - mse: 0.8003 - mae: 0.7047 - val_loss: 1.0375 - val_mse: 1.0375 - val_mae: 0.7627 Epoch 3/40 3/3 [==============================] - 0s 38ms/step - loss: 0.6692 - mse: 0.6692 - mae: 0.6315 - val_loss: 0.7258 - val_mse: 0.7258 - val_mae: 0.6202 Epoch 4/40 3/3 [==============================] - 0s 42ms/step - loss: 0.3853 - mse: 0.3853 - mae: 0.4627 - val_loss: 0.2971 - val_mse: 0.2971 - val_mae: 0.3433 Epoch 5/40 3/3 [==============================] - 0s 41ms/step - loss: 0.1832 - mse: 0.1832 - mae: 0.2912 - val_loss: 0.1127 - val_mse: 0.1127 - val_mae: 0.2656 Epoch 6/40 3/3 [==============================] - 0s 46ms/step - loss: 0.1054 - mse: 0.1054 - mae: 0.2555 - val_loss: 0.2310 - val_mse: 0.2310 - val_mae: 0.3723 Epoch 7/40 3/3 [==============================] - 0s 67ms/step - loss: 0.1590 - mse: 0.1590 - mae: 0.3158 - val_loss: 0.1858 - val_mse: 0.1858 - val_mae: 0.3358 Epoch 8/40 3/3 [==============================] - 0s 60ms/step - loss: 0.1221 - mse: 0.1221 - mae: 0.2605 - val_loss: 0.0740 - val_mse: 0.0740 - val_mae: 0.2339 Epoch 9/40 3/3 [==============================] - 0s 60ms/step - loss: 0.0769 - mse: 0.0769 - mae: 0.2229 - val_loss: 0.0902 - val_mse: 0.0902 - val_mae: 0.2187 Epoch 10/40 3/3 [==============================] - 0s 81ms/step - loss: 0.0740 - mse: 0.0740 - mae: 0.1926 - val_loss: 0.1062 - val_mse: 0.1062 - val_mae: 0.2100 Epoch 11/40 3/3 [==============================] - 0s 64ms/step - loss: 0.0807 - mse: 0.0807 - mae: 0.1986 - val_loss: 0.0893 - val_mse: 0.0893 - val_mae: 0.2025 Epoch 12/40 3/3 [==============================] - 0s 61ms/step - loss: 0.0891 - mse: 0.0891 - mae: 0.2086 - val_loss: 0.0575 - val_mse: 0.0575 - val_mae: 0.1793 Epoch 13/40 3/3 [==============================] - 0s 60ms/step - loss: 0.0599 - mse: 0.0599 - mae: 0.1769 - val_loss: 0.0513 - val_mse: 0.0513 - val_mae: 0.1723 Epoch 14/40 3/3 [==============================] - 0s 65ms/step - loss: 0.0515 - mse: 0.0515 - mae: 0.1749 - val_loss: 0.0740 - val_mse: 0.0740 - val_mae: 0.2093 Epoch 15/40 3/3 [==============================] - 0s 57ms/step - loss: 0.0550 - mse: 0.0550 - mae: 0.1783 - val_loss: 0.0788 - val_mse: 0.0788 - val_mae: 0.2178 Epoch 16/40 3/3 [==============================] - 0s 56ms/step - loss: 0.0655 - mse: 0.0655 - mae: 0.1941 - val_loss: 0.0548 - val_mse: 0.0548 - val_mae: 0.1837 Epoch 17/40 3/3 [==============================] - 0s 60ms/step - loss: 0.0431 - mse: 0.0431 - mae: 0.1603 - val_loss: 0.0412 - val_mse: 0.0412 - val_mae: 0.1610 Epoch 18/40 3/3 [==============================] - 0s 57ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1512 - val_loss: 0.0473 - val_mse: 0.0473 - val_mae: 0.1615 Epoch 19/40 3/3 [==============================] - 0s 60ms/step - loss: 0.0426 - mse: 0.0426 - mae: 0.1599 - val_loss: 0.0502 - val_mse: 0.0502 - val_mae: 0.1643 Epoch 20/40 3/3 [==============================] - 0s 61ms/step - loss: 0.0579 - mse: 0.0579 - mae: 0.1759 - val_loss: 0.0423 - val_mse: 0.0423 - val_mae: 0.1570 Epoch 21/40 3/3 [==============================] - 0s 60ms/step - loss: 0.0507 - mse: 0.0507 - mae: 0.1801 - val_loss: 0.0473 - val_mse: 0.0473 - val_mae: 0.1655 Epoch 22/40 3/3 [==============================] - 0s 71ms/step - loss: 0.0399 - mse: 0.0399 - mae: 0.1639 - val_loss: 0.0588 - val_mse: 0.0588 - val_mae: 0.1883 Epoch 23/40 3/3 [==============================] - 0s 36ms/step - loss: 0.0468 - mse: 0.0468 - mae: 0.1620 - val_loss: 0.0428 - val_mse: 0.0428 - val_mae: 0.1614 Epoch 24/40 3/3 [==============================] - 0s 35ms/step - loss: 0.0417 - mse: 0.0417 - mae: 0.1593 - val_loss: 0.0419 - val_mse: 0.0419 - val_mae: 0.1566 Epoch 25/40 3/3 [==============================] - 0s 40ms/step - loss: 0.0481 - mse: 0.0481 - mae: 0.1619 - val_loss: 0.0418 - val_mse: 0.0418 - val_mae: 0.1554 Epoch 26/40 3/3 [==============================] - 0s 40ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1476 - val_loss: 0.0420 - val_mse: 0.0420 - val_mae: 0.1560 Epoch 27/40 3/3 [==============================] - 0s 38ms/step - loss: 0.0471 - mse: 0.0471 - mae: 0.1628 - val_loss: 0.0429 - val_mse: 0.0429 - val_mae: 0.1625 Epoch 28/40 3/3 [==============================] - 0s 35ms/step - loss: 0.0461 - mse: 0.0461 - mae: 0.1648 - val_loss: 0.0434 - val_mse: 0.0434 - val_mae: 0.1660 Epoch 29/40 3/3 [==============================] - 0s 36ms/step - loss: 0.0360 - mse: 0.0360 - mae: 0.1533 - val_loss: 0.0427 - val_mse: 0.0427 - val_mae: 0.1616 Epoch 30/40 3/3 [==============================] - 0s 36ms/step - loss: 0.0376 - mse: 0.0376 - mae: 0.1511 - val_loss: 0.0431 - val_mse: 0.0431 - val_mae: 0.1578 Epoch 31/40 3/3 [==============================] - 0s 37ms/step - loss: 0.0435 - mse: 0.0435 - mae: 0.1703 - val_loss: 0.0448 - val_mse: 0.0448 - val_mae: 0.1599 Epoch 32/40 3/3 [==============================] - 0s 39ms/step - loss: 0.0414 - mse: 0.0414 - mae: 0.1560 - val_loss: 0.0459 - val_mse: 0.0459 - val_mae: 0.1602 Epoch 33/40 3/3 [==============================] - 0s 44ms/step - loss: 0.0437 - mse: 0.0437 - mae: 0.1640 - val_loss: 0.0477 - val_mse: 0.0477 - val_mae: 0.1672 Epoch 34/40 3/3 [==============================] - 0s 43ms/step - loss: 0.0421 - mse: 0.0421 - mae: 0.1515 - val_loss: 0.0508 - val_mse: 0.0508 - val_mae: 0.1743 Epoch 35/40 3/3 [==============================] - 0s 45ms/step - loss: 0.0427 - mse: 0.0427 - mae: 0.1623 - val_loss: 0.0461 - val_mse: 0.0461 - val_mae: 0.1658 Epoch 36/40 3/3 [==============================] - 0s 34ms/step - loss: 0.0444 - mse: 0.0444 - mae: 0.1668 - val_loss: 0.0426 - val_mse: 0.0426 - val_mae: 0.1588 Epoch 37/40 3/3 [==============================] - 0s 38ms/step - loss: 0.0468 - mse: 0.0468 - mae: 0.1661 - val_loss: 0.0496 - val_mse: 0.0496 - val_mae: 0.1661 Epoch 38/40 3/3 [==============================] - 0s 38ms/step - loss: 0.0431 - mse: 0.0431 - mae: 0.1544 - val_loss: 0.0435 - val_mse: 0.0435 - val_mae: 0.1588 Epoch 39/40 3/3 [==============================] - 0s 36ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1587 - val_loss: 0.0440 - val_mse: 0.0440 - val_mae: 0.1577 Epoch 40/40 3/3 [==============================] - 0s 34ms/step - loss: 0.0546 - mse: 0.0546 - mae: 0.1726 - val_loss: 0.0544 - val_mse: 0.0544 - val_mae: 0.1774
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7fc355192470>
plt.plot(history.history['mse'], label='train mse')
plt.plot(history.history['val_mse'], label='val mse')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7fc355252b30>
plt.plot(history.history['mae'], label='train mae')
plt.plot(history.history['val_mae'], label='val mae')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7fc355588a60>
model.evaluate(test_X, test_Y)
2/2 [==============================] - 0s 10ms/step - loss: 0.0544 - mse: 0.0544 - mae: 0.1774
[0.054426856338977814, 0.054426856338977814, 0.1773832142353058]
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
# predict probabilities for test set
yhat_probs = model.predict(test_X, verbose=0)
# reduce to 1d array
yhat_probs = yhat_probs[:, 0]
var = explained_variance_score(test_Y.reshape(-1,1), yhat_probs)
print('Variance: %f' % var)
r2 = r2_score(test_Y.reshape(-1,1), yhat_probs)
print('R2 Score: %f' % var)
var2 = max_error(test_Y.reshape(-1,1), yhat_probs)
print('Max Error: %f' % var2)
Variance: 0.962742 R2 Score: 0.962742 Max Error: 0.613890
predicted = model.predict(test_X)
test_label = test_Y.reshape(-1,1)
predicted = np.array(predicted[:,0]).reshape(-1,1)
len_t = len(train_X)
for j in range(len_t , len_t + len(test_X)):
temp = df_hist.iloc[j,3]
test_label[j - len_t] = test_label[j - len_t] * temp + temp
predicted[j - len_t] = predicted[j - len_t] * temp + temp
plt.plot(predicted, color = 'green', label = 'Predicted Stock Price')
plt.plot(test_label, color = 'red', label = 'Real Stock Price')
plt.title(' Stock Price Prediction')
plt.xlabel('Time')
plt.ylabel(' Stock Price')
plt.legend()
plt.show()
2/2 [==============================] - 0s 7ms/step
df.head()
date | open | high | low | close | adjclose | volume | twoweeks | month | trends | news_pos | news_neg | news_neu | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-07-12 | -1.703818 | -1.730261 | -1.682950 | -1.705764 | -1.872820 | -0.840392 | -1.738792 | -1.736071 | 2.580289 | -1.106820 | -1.193801 | 1.766450 | -1.503002 | -0.103907 | 1.464049 |
1 | 2020-07-19 | -1.505790 | -1.470011 | -1.490960 | -1.477822 | -1.658905 | -0.124160 | -1.644438 | -1.692888 | 2.198301 | -1.106820 | 3.504942 | -1.899286 | -1.503002 | 1.212374 | 0.306837 |
2 | 2020-07-26 | -1.285591 | -1.316556 | -1.264110 | -1.286323 | -1.479190 | -0.955812 | -1.464655 | -1.600581 | 2.102805 | 1.052492 | -1.193801 | 0.137234 | -0.824220 | -0.338957 | 1.050759 |
3 | 2020-08-02 | -1.223978 | -1.236431 | -1.202859 | -1.215514 | -1.412737 | -0.927118 | -1.282168 | -1.461636 | 2.102805 | 0.188767 | 0.059197 | -0.188609 | -0.515683 | 0.494403 | 0.036320 |
4 | 2020-08-09 | -1.153899 | -1.174523 | -1.112084 | -1.138360 | -1.340332 | -1.205426 | -1.199974 | -1.330150 | 1.625320 | 1.052492 | -1.193801 | 0.137234 | 0.307084 | 0.209494 | -0.464638 |
df_x = df.drop(['date','twoweeks', 'month'], axis=1)
df_x.head()
open | high | low | close | adjclose | volume | trends | news_pos | news_neg | news_neu | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.703818 | -1.730261 | -1.682950 | -1.705764 | -1.872820 | -0.840392 | 2.580289 | -1.106820 | -1.193801 | 1.766450 | -1.503002 | -0.103907 | 1.464049 |
1 | -1.505790 | -1.470011 | -1.490960 | -1.477822 | -1.658905 | -0.124160 | 2.198301 | -1.106820 | 3.504942 | -1.899286 | -1.503002 | 1.212374 | 0.306837 |
2 | -1.285591 | -1.316556 | -1.264110 | -1.286323 | -1.479190 | -0.955812 | 2.102805 | 1.052492 | -1.193801 | 0.137234 | -0.824220 | -0.338957 | 1.050759 |
3 | -1.223978 | -1.236431 | -1.202859 | -1.215514 | -1.412737 | -0.927118 | 2.102805 | 0.188767 | 0.059197 | -0.188609 | -0.515683 | 0.494403 | 0.036320 |
4 | -1.153899 | -1.174523 | -1.112084 | -1.138360 | -1.340332 | -1.205426 | 1.625320 | 1.052492 | -1.193801 | 0.137234 | 0.307084 | 0.209494 | -0.464638 |
X = [[df_x.iloc[j, i] for i in range(df_x.shape[1])] for j in range(df_x.shape[0])]
Y = [df.iloc[i,7] for i in range(df.shape[0])]
print(X[0:2])
print(Y[0:2])
[[-1.7038183261524975, -1.7302608232721683, -1.6829499939918715, -1.7057639670236395, -1.872820057485404, -0.8403922760048991, 2.580289089276951, -1.1068195873526132, -1.1938008487946932, 1.7664503220919647, -1.5030018624207688, -0.1039065146895656, 1.464048694140032], [-1.5057895100199, -1.470010607913253, -1.4909599654415322, -1.4778224513157197, -1.6589052190920777, -0.12416025602251, 2.198301444710552, -1.1068195873526132, 3.504942032500697, -1.8992863327148637, -1.5030018624207688, 1.2123738405060045, 0.3068368015117597]] [-1.738791872212694, -1.6444381302921762]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)
train_X = np.array(x_train)
test_X = np.array(x_test)
train_Y = np.array(y_train)
test_Y = np.array(y_test)
train_X = train_X.reshape(train_X.shape[0],1,13,1)
test_X = test_X.reshape(test_X.shape[0],1,13,1)
print(len(train_X))
print(len(test_X))
107 46
# For creating model and training
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
from tensorflow.keras.layers import MaxPooling1D, Flatten
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.metrics import RootMeanSquaredError
model = tf.keras.Sequential()
# Creating the Neural Network model here...
# CNN layers
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu', input_shape=(None,13, 1))))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(128, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Flatten()))
#model.add(Dense(5, kernel_regularizer=L2(0.01)))
# LSTM layers
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dropout(0.5))
#Final layers
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])
history = model.fit(train_X, train_Y, validation_data=(test_X,test_Y), epochs=40,batch_size=40, verbose=1, shuffle =True)
Epoch 1/40 3/3 [==============================] - 14s 1s/step - loss: 1.1132 - mse: 1.1132 - mae: 0.7954 - val_loss: 0.6801 - val_mse: 0.6801 - val_mae: 0.7047 Epoch 2/40 3/3 [==============================] - 0s 47ms/step - loss: 1.0145 - mse: 1.0145 - mae: 0.7659 - val_loss: 0.5992 - val_mse: 0.5992 - val_mae: 0.6581 Epoch 3/40 3/3 [==============================] - 0s 49ms/step - loss: 0.8404 - mse: 0.8404 - mae: 0.7018 - val_loss: 0.3991 - val_mse: 0.3991 - val_mae: 0.5249 Epoch 4/40 3/3 [==============================] - 0s 49ms/step - loss: 0.4299 - mse: 0.4299 - mae: 0.5080 - val_loss: 0.0980 - val_mse: 0.0980 - val_mae: 0.2482 Epoch 5/40 3/3 [==============================] - 0s 49ms/step - loss: 0.1155 - mse: 0.1155 - mae: 0.2599 - val_loss: 0.3027 - val_mse: 0.3027 - val_mae: 0.4706 Epoch 6/40 3/3 [==============================] - 0s 46ms/step - loss: 0.2489 - mse: 0.2489 - mae: 0.4039 - val_loss: 0.1436 - val_mse: 0.1436 - val_mae: 0.3104 Epoch 7/40 3/3 [==============================] - 0s 51ms/step - loss: 0.0943 - mse: 0.0943 - mae: 0.2432 - val_loss: 0.0345 - val_mse: 0.0345 - val_mae: 0.1563 Epoch 8/40 3/3 [==============================] - 0s 54ms/step - loss: 0.0843 - mse: 0.0843 - mae: 0.2124 - val_loss: 0.0730 - val_mse: 0.0730 - val_mae: 0.2204 Epoch 9/40 3/3 [==============================] - 0s 43ms/step - loss: 0.1079 - mse: 0.1079 - mae: 0.2609 - val_loss: 0.0979 - val_mse: 0.0979 - val_mae: 0.2547 Epoch 10/40 3/3 [==============================] - 0s 50ms/step - loss: 0.0937 - mse: 0.0937 - mae: 0.2384 - val_loss: 0.0860 - val_mse: 0.0860 - val_mae: 0.2278 Epoch 11/40 3/3 [==============================] - 0s 51ms/step - loss: 0.0645 - mse: 0.0645 - mae: 0.2031 - val_loss: 0.0639 - val_mse: 0.0639 - val_mae: 0.1807 Epoch 12/40 3/3 [==============================] - 0s 50ms/step - loss: 0.0624 - mse: 0.0624 - mae: 0.1883 - val_loss: 0.0604 - val_mse: 0.0604 - val_mae: 0.1955 Epoch 13/40 3/3 [==============================] - 0s 51ms/step - loss: 0.0650 - mse: 0.0650 - mae: 0.1957 - val_loss: 0.0530 - val_mse: 0.0530 - val_mae: 0.1897 Epoch 14/40 3/3 [==============================] - 0s 58ms/step - loss: 0.0640 - mse: 0.0640 - mae: 0.1997 - val_loss: 0.0385 - val_mse: 0.0385 - val_mae: 0.1571 Epoch 15/40 3/3 [==============================] - 0s 47ms/step - loss: 0.0470 - mse: 0.0470 - mae: 0.1674 - val_loss: 0.0439 - val_mse: 0.0439 - val_mae: 0.1619 Epoch 16/40 3/3 [==============================] - 0s 49ms/step - loss: 0.0499 - mse: 0.0499 - mae: 0.1750 - val_loss: 0.0491 - val_mse: 0.0491 - val_mae: 0.1778 Epoch 17/40 3/3 [==============================] - 0s 52ms/step - loss: 0.0638 - mse: 0.0638 - mae: 0.1926 - val_loss: 0.0398 - val_mse: 0.0398 - val_mae: 0.1613 Epoch 18/40 3/3 [==============================] - 0s 53ms/step - loss: 0.0652 - mse: 0.0652 - mae: 0.1923 - val_loss: 0.0322 - val_mse: 0.0322 - val_mae: 0.1411 Epoch 19/40 3/3 [==============================] - 0s 57ms/step - loss: 0.0420 - mse: 0.0420 - mae: 0.1581 - val_loss: 0.0354 - val_mse: 0.0354 - val_mae: 0.1437 Epoch 20/40 3/3 [==============================] - 0s 53ms/step - loss: 0.0519 - mse: 0.0519 - mae: 0.1740 - val_loss: 0.0377 - val_mse: 0.0377 - val_mae: 0.1469 Epoch 21/40 3/3 [==============================] - 0s 57ms/step - loss: 0.0496 - mse: 0.0496 - mae: 0.1688 - val_loss: 0.0380 - val_mse: 0.0380 - val_mae: 0.1489 Epoch 22/40 3/3 [==============================] - 0s 49ms/step - loss: 0.0452 - mse: 0.0452 - mae: 0.1637 - val_loss: 0.0366 - val_mse: 0.0366 - val_mae: 0.1491 Epoch 23/40 3/3 [==============================] - 0s 49ms/step - loss: 0.0508 - mse: 0.0508 - mae: 0.1720 - val_loss: 0.0406 - val_mse: 0.0406 - val_mae: 0.1534 Epoch 24/40 3/3 [==============================] - 0s 49ms/step - loss: 0.0493 - mse: 0.0493 - mae: 0.1681 - val_loss: 0.0477 - val_mse: 0.0477 - val_mae: 0.1597 Epoch 25/40 3/3 [==============================] - 0s 49ms/step - loss: 0.0461 - mse: 0.0461 - mae: 0.1680 - val_loss: 0.0416 - val_mse: 0.0416 - val_mae: 0.1569 Epoch 26/40 3/3 [==============================] - 0s 45ms/step - loss: 0.0503 - mse: 0.0503 - mae: 0.1728 - val_loss: 0.0438 - val_mse: 0.0438 - val_mae: 0.1581 Epoch 27/40 3/3 [==============================] - 0s 53ms/step - loss: 0.0651 - mse: 0.0651 - mae: 0.1787 - val_loss: 0.0394 - val_mse: 0.0394 - val_mae: 0.1576 Epoch 28/40 3/3 [==============================] - 0s 50ms/step - loss: 0.0557 - mse: 0.0557 - mae: 0.1833 - val_loss: 0.0397 - val_mse: 0.0397 - val_mae: 0.1562 Epoch 29/40 3/3 [==============================] - 0s 63ms/step - loss: 0.0469 - mse: 0.0469 - mae: 0.1637 - val_loss: 0.0443 - val_mse: 0.0443 - val_mae: 0.1593 Epoch 30/40 3/3 [==============================] - 0s 59ms/step - loss: 0.0419 - mse: 0.0419 - mae: 0.1584 - val_loss: 0.0377 - val_mse: 0.0377 - val_mae: 0.1574 Epoch 31/40 3/3 [==============================] - 0s 54ms/step - loss: 0.0513 - mse: 0.0513 - mae: 0.1747 - val_loss: 0.0392 - val_mse: 0.0392 - val_mae: 0.1544 Epoch 32/40 3/3 [==============================] - 0s 54ms/step - loss: 0.0414 - mse: 0.0414 - mae: 0.1523 - val_loss: 0.0548 - val_mse: 0.0548 - val_mae: 0.1641 Epoch 33/40 3/3 [==============================] - 0s 53ms/step - loss: 0.0598 - mse: 0.0598 - mae: 0.1620 - val_loss: 0.0430 - val_mse: 0.0430 - val_mae: 0.1545 Epoch 34/40 3/3 [==============================] - 0s 55ms/step - loss: 0.0423 - mse: 0.0423 - mae: 0.1586 - val_loss: 0.0377 - val_mse: 0.0377 - val_mae: 0.1541 Epoch 35/40 3/3 [==============================] - 0s 54ms/step - loss: 0.0558 - mse: 0.0558 - mae: 0.1727 - val_loss: 0.0396 - val_mse: 0.0396 - val_mae: 0.1559 Epoch 36/40 3/3 [==============================] - 0s 61ms/step - loss: 0.0518 - mse: 0.0518 - mae: 0.1639 - val_loss: 0.0411 - val_mse: 0.0411 - val_mae: 0.1554 Epoch 37/40 3/3 [==============================] - 0s 51ms/step - loss: 0.0511 - mse: 0.0511 - mae: 0.1643 - val_loss: 0.0382 - val_mse: 0.0382 - val_mae: 0.1548 Epoch 38/40 3/3 [==============================] - 0s 52ms/step - loss: 0.0453 - mse: 0.0453 - mae: 0.1530 - val_loss: 0.0372 - val_mse: 0.0372 - val_mae: 0.1545 Epoch 39/40 3/3 [==============================] - 0s 55ms/step - loss: 0.0413 - mse: 0.0413 - mae: 0.1563 - val_loss: 0.0433 - val_mse: 0.0433 - val_mae: 0.1608 Epoch 40/40 3/3 [==============================] - 0s 56ms/step - loss: 0.0496 - mse: 0.0496 - mae: 0.1716 - val_loss: 0.0464 - val_mse: 0.0464 - val_mae: 0.1663
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7fc354e366b0>
model.evaluate(test_X, test_Y)
2/2 [==============================] - 0s 11ms/step - loss: 0.0464 - mse: 0.0464 - mae: 0.1663
[0.046359434723854065, 0.046359434723854065, 0.1662808209657669]
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
# predict probabilities for test set
yhat_probs = model.predict(test_X, verbose=0)
# reduce to 1d array
yhat_probs = yhat_probs[:, 0]
var = explained_variance_score(test_Y.reshape(-1,1), yhat_probs)
print('Variance: %f' % var)
r2 = r2_score(test_Y.reshape(-1,1), yhat_probs)
print('R2 Score: %f' % var)
var2 = max_error(test_Y.reshape(-1,1), yhat_probs)
print('Max Error: %f' % var2)
Variance: 0.938245 R2 Score: 0.938245 Max Error: 0.774743
predicted = model.predict(test_X)
test_label = test_Y.reshape(-1,1)
predicted = np.array(predicted[:,0]).reshape(-1,1)
len_t = len(train_X)
for j in range(len_t , len_t + len(test_X)):
temp = df.iloc[j,3]
test_label[j - len_t] = test_label[j - len_t] * temp + temp
predicted[j - len_t] = predicted[j - len_t] * temp + temp
plt.plot(predicted, color = 'green', label = 'Predicted Stock Price')
plt.plot(test_label, color = 'red', label = 'Real Stock Price')
plt.title(' Stock Price Prediction')
plt.xlabel('Time')
plt.ylabel(' Stock Price')
plt.legend()
plt.show()
2/2 [==============================] - 0s 18ms/step
I will cluster the data points using K-Means and add that cluster information to the original data and run Linear Regression and CNN+LSTM to see if it will improve their performance. I need to reduce the dimension of the data using PCA.
df.corr().round(3)
<ipython-input-87-8faa7a8256e3>:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. df.corr().round(3)
open | high | low | close | adjclose | volume | twoweeks | month | trends | news_pos | news_neg | news_neu | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
open | 1.000 | 0.999 | 0.999 | 0.998 | 0.986 | -0.051 | 0.983 | 0.954 | -0.221 | 0.241 | -0.043 | -0.148 | 0.009 | -0.075 | 0.058 |
high | 0.999 | 1.000 | 0.998 | 0.999 | 0.988 | -0.040 | 0.982 | 0.954 | -0.227 | 0.246 | -0.038 | -0.156 | 0.010 | -0.074 | 0.056 |
low | 0.999 | 0.998 | 1.000 | 0.999 | 0.985 | -0.075 | 0.978 | 0.947 | -0.210 | 0.238 | -0.045 | -0.145 | 0.008 | -0.079 | 0.062 |
close | 0.998 | 0.999 | 0.999 | 1.000 | 0.988 | -0.059 | 0.978 | 0.949 | -0.218 | 0.245 | -0.041 | -0.152 | 0.009 | -0.077 | 0.059 |
adjclose | 0.986 | 0.988 | 0.985 | 0.988 | 1.000 | -0.042 | 0.970 | 0.945 | -0.321 | 0.267 | -0.013 | -0.192 | 0.011 | -0.072 | 0.053 |
volume | -0.051 | -0.040 | -0.075 | -0.059 | -0.042 | 1.000 | 0.018 | 0.058 | -0.057 | 0.048 | -0.005 | -0.032 | 0.014 | 0.090 | -0.092 |
twoweeks | 0.983 | 0.982 | 0.978 | 0.978 | 0.970 | 0.018 | 1.000 | 0.986 | -0.266 | 0.248 | -0.035 | -0.160 | 0.006 | -0.077 | 0.062 |
month | 0.954 | 0.954 | 0.947 | 0.949 | 0.945 | 0.058 | 0.986 | 1.000 | -0.311 | 0.229 | -0.034 | -0.146 | -0.003 | -0.074 | 0.068 |
trends | -0.221 | -0.227 | -0.210 | -0.218 | -0.321 | -0.057 | -0.266 | -0.311 | 1.000 | -0.227 | -0.072 | 0.227 | -0.015 | 0.006 | 0.008 |
news_pos | 0.241 | 0.246 | 0.238 | 0.245 | 0.267 | 0.048 | 0.248 | 0.229 | -0.227 | 1.000 | -0.151 | -0.637 | -0.074 | 0.002 | 0.066 |
news_neg | -0.043 | -0.038 | -0.045 | -0.041 | -0.013 | -0.005 | -0.035 | -0.034 | -0.072 | -0.151 | 1.000 | -0.666 | -0.004 | -0.005 | 0.008 |
news_neu | -0.148 | -0.156 | -0.145 | -0.152 | -0.192 | -0.032 | -0.160 | -0.146 | 0.227 | -0.637 | -0.666 | 1.000 | 0.059 | 0.003 | -0.056 |
tweet_pos | 0.009 | 0.010 | 0.008 | 0.009 | 0.011 | 0.014 | 0.006 | -0.003 | -0.015 | -0.074 | -0.004 | 0.059 | 1.000 | -0.378 | -0.581 |
tweet_neg | -0.075 | -0.074 | -0.079 | -0.077 | -0.072 | 0.090 | -0.077 | -0.074 | 0.006 | 0.002 | -0.005 | 0.003 | -0.378 | 1.000 | -0.534 |
tweet_neu | 0.058 | 0.056 | 0.062 | 0.059 | 0.053 | -0.092 | 0.062 | 0.068 | 0.008 | 0.066 | 0.008 | -0.056 | -0.581 | -0.534 | 1.000 |
import pandas as pd
df_final = pd.read_csv('df_final_home_depot_scaled.csv')
df_final.head()
Unnamed: 0 | date | open | high | low | close | adjclose | volume | twoweeks | month | trends | news_pos | news_neg | news_neu | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2020-07-12 | -1.703818 | -1.730261 | -1.682950 | -1.705764 | -1.872820 | -0.840392 | -1.738792 | -1.736071 | 2.580289 | -1.106820 | -1.193801 | 1.766450 | -1.503002 | -0.103907 | 1.464049 |
1 | 1 | 2020-07-19 | -1.505790 | -1.470011 | -1.490960 | -1.477822 | -1.658905 | -0.124160 | -1.644438 | -1.692888 | 2.198301 | -1.106820 | 3.504942 | -1.899286 | -1.503002 | 1.212374 | 0.306837 |
2 | 2 | 2020-07-26 | -1.285591 | -1.316556 | -1.264110 | -1.286323 | -1.479190 | -0.955812 | -1.464655 | -1.600581 | 2.102805 | 1.052492 | -1.193801 | 0.137234 | -0.824220 | -0.338957 | 1.050759 |
3 | 3 | 2020-08-02 | -1.223978 | -1.236431 | -1.202859 | -1.215514 | -1.412737 | -0.927118 | -1.282168 | -1.461636 | 2.102805 | 0.188767 | 0.059197 | -0.188609 | -0.515683 | 0.494403 | 0.036320 |
4 | 4 | 2020-08-09 | -1.153899 | -1.174523 | -1.112084 | -1.138360 | -1.340332 | -1.205426 | -1.199974 | -1.330150 | 1.625320 | 1.052492 | -1.193801 | 0.137234 | 0.307084 | 0.209494 | -0.464638 |
df_final.columns
Index(['Unnamed: 0', 'date', 'open', 'high', 'low', 'close', 'adjclose', 'volume', 'twoweeks', 'month', 'trends', 'news_pos', 'news_neg', 'news_neu', 'tweet_pos', 'tweet_neg', 'tweet_neu'], dtype='object')
df_values = df_final[['open', 'high', 'low', 'close', 'adjclose',
'volume', 'trends', 'news_pos', 'news_neg',
'news_neu', 'tweet_pos', 'tweet_neg', 'tweet_neu']]
df_values.head()
open | high | low | close | adjclose | volume | trends | news_pos | news_neg | news_neu | tweet_pos | tweet_neg | tweet_neu | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.703818 | -1.730261 | -1.682950 | -1.705764 | -1.872820 | -0.840392 | 2.580289 | -1.106820 | -1.193801 | 1.766450 | -1.503002 | -0.103907 | 1.464049 |
1 | -1.505790 | -1.470011 | -1.490960 | -1.477822 | -1.658905 | -0.124160 | 2.198301 | -1.106820 | 3.504942 | -1.899286 | -1.503002 | 1.212374 | 0.306837 |
2 | -1.285591 | -1.316556 | -1.264110 | -1.286323 | -1.479190 | -0.955812 | 2.102805 | 1.052492 | -1.193801 | 0.137234 | -0.824220 | -0.338957 | 1.050759 |
3 | -1.223978 | -1.236431 | -1.202859 | -1.215514 | -1.412737 | -0.927118 | 2.102805 | 0.188767 | 0.059197 | -0.188609 | -0.515683 | 0.494403 | 0.036320 |
4 | -1.153899 | -1.174523 | -1.112084 | -1.138360 | -1.340332 | -1.205426 | 1.625320 | 1.052492 | -1.193801 | 0.137234 | 0.307084 | 0.209494 | -0.464638 |
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
reduced_df = pd.DataFrame(pca.fit_transform(df_values), columns=(["c1","c2", "c3"]))
reduced_df.head()
c1 | c2 | c3 | |
---|---|---|---|
0 | -4.406186 | -1.618010 | -2.783632 |
1 | -3.608144 | 3.040917 | -0.156988 |
2 | -2.935629 | 0.057277 | -1.898846 |
3 | -2.966506 | 0.316674 | -0.513015 |
4 | -2.639892 | -0.374056 | -0.027751 |
import sklearn
from sklearn.cluster import KMeans
distances = []
k_values = range(2,10)
for cluster in k_values:
kmeans = KMeans(n_clusters=cluster)
kmeans.fit(reduced_df)
distances.append(kmeans.inertia_)
plt.plot(k_values, distances,'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of Distances')
plt.title('Elbow Method For Optimal Number of Clusters')
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn(
from yellowbrick.cluster import KElbowVisualizer
auto_elbow = KElbowVisualizer(KMeans(), k=10)
auto_elbow.fit(df_values)
auto_elbow.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn(
<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
from sklearn.cluster import KMeans
km=KMeans(n_clusters=5)
clusters= km.fit_predict(reduced_df)
pca_clust = reduced_df.copy()
pca_clust['clusters']=clusters
pca_clust.head()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn(
c1 | c2 | c3 | clusters | |
---|---|---|---|---|
0 | -4.406186 | -1.618010 | -2.783632 | 2 |
1 | -3.608144 | 3.040917 | -0.156988 | 4 |
2 | -2.935629 | 0.057277 | -1.898846 | 4 |
3 | -2.966506 | 0.316674 | -0.513015 | 4 |
4 | -2.639892 | -0.374056 | -0.027751 | 2 |
df_values['cluster'] = clusters
df_values.head()
<ipython-input-95-7dac953dbd69>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_values['cluster'] = clusters
open | high | low | close | adjclose | volume | trends | news_pos | news_neg | news_neu | tweet_pos | tweet_neg | tweet_neu | cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.703818 | -1.730261 | -1.682950 | -1.705764 | -1.872820 | -0.840392 | 2.580289 | -1.106820 | -1.193801 | 1.766450 | -1.503002 | -0.103907 | 1.464049 | 2 |
1 | -1.505790 | -1.470011 | -1.490960 | -1.477822 | -1.658905 | -0.124160 | 2.198301 | -1.106820 | 3.504942 | -1.899286 | -1.503002 | 1.212374 | 0.306837 | 4 |
2 | -1.285591 | -1.316556 | -1.264110 | -1.286323 | -1.479190 | -0.955812 | 2.102805 | 1.052492 | -1.193801 | 0.137234 | -0.824220 | -0.338957 | 1.050759 | 4 |
3 | -1.223978 | -1.236431 | -1.202859 | -1.215514 | -1.412737 | -0.927118 | 2.102805 | 0.188767 | 0.059197 | -0.188609 | -0.515683 | 0.494403 | 0.036320 | 4 |
4 | -1.153899 | -1.174523 | -1.112084 | -1.138360 | -1.340332 | -1.205426 | 1.625320 | 1.052492 | -1.193801 | 0.137234 | 0.307084 | 0.209494 | -0.464638 | 2 |
df_x = df_values[['open', 'high', 'low', 'close', 'adjclose','volume', 'cluster']]
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score
x_train, x_test, y_train, y_test = train_test_split(df_x, df_final['twoweeks'], test_size=0.3, random_state=0)
regression = LinearRegression()
regression.fit(x_train, y_train)
print("regression coefficient",regression.coef_)
print("regression intercept",regression.intercept_)
regression coefficient [ 2.80045428 -0.37223382 -1.53280191 0.06410424 0.02119985 0.03065633 -0.00540246] regression intercept -0.012097080760859524
regression_confidence = regression.score(x_test, y_test)
print("linear regression confidence: ", regression_confidence)
linear regression confidence: 0.9476930471292958
predicted=regression.predict(x_test)
print(x_test.head())
open high low close adjclose volume cluster 26 -1.222595 -1.220040 -1.189692 -1.194005 -1.314978 0.075531 4 135 0.423421 0.383837 0.416818 0.388613 0.595698 -0.751431 0 63 0.783028 0.758650 0.821157 0.777318 0.707787 -0.779556 1 105 -0.615611 -0.554651 -0.594871 -0.573033 -0.461688 -0.918600 4 24 -1.103859 -1.135303 -1.064246 -1.082658 -1.209359 -0.907303 2
dfr=pd.DataFrame({'Actual_Price':y_test, 'Predicted_Price':predicted}).reset_index()
dfr.drop(columns='index', inplace=True, axis=1)
dfr.head(10)
Actual_Price | Predicted_Price | |
---|---|---|
0 | -1.159801 | -1.281929 |
1 | 0.471989 | 0.406400 |
2 | 0.753759 | 0.675203 |
3 | -0.716651 | -0.714101 |
4 | -1.131320 | -1.183188 |
5 | -0.671596 | -0.561560 |
6 | 0.666571 | 0.697591 |
7 | -0.405871 | -0.754095 |
8 | -0.043019 | -0.375646 |
9 | 0.400488 | 0.381176 |
from sklearn import metrics
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, predicted))
print('Mean Squared Error (MSE) :', metrics.mean_squared_error(y_test, predicted))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, predicted)))
Mean Absolute Error (MAE): 0.1337945393494824 Mean Squared Error (MSE) : 0.031248325149281247 Root Mean Squared Error (RMSE): 0.17677195803995963
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price, color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()
plt.plot(dfr.Actual_Price, color='black', label='Actual')
plt.plot(dfr.Predicted_Price, color='blue',label='Predicted')
plt.title("Predictions using Historical")
plt.legend()
<matplotlib.legend.Legend at 0x7fc34d99c520>
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score
x_train, x_test, y_train, y_test = train_test_split(df_values, df_final['twoweeks'], test_size=0.3, random_state=0)
regression = LinearRegression()
regression.fit(x_train, y_train)
print("regression coefficient",regression.coef_)
print("regression intercept",regression.intercept_)
regression coefficient [ 2.66166755e+00 -3.53252621e-01 -1.39810203e+00 1.48573349e-01 -8.49023102e-02 3.15909420e-02 -2.99197954e-02 -1.89951390e-03 -6.61002963e-03 6.59002912e-03 -1.43476246e-03 8.87248181e-04 5.30348639e-04 -2.67182404e-03] regression intercept -0.018002014019515143
regression_confidence = regression.score(x_test, y_test)
print("linear regression confidence: ", regression_confidence)
linear regression confidence: 0.9493158696671368
predicted=regression.predict(x_test)
print(x_test.head())
open high low close adjclose volume trends \ 26 -1.222595 -1.220040 -1.189692 -1.194005 -1.314978 0.075531 0.097369 135 0.423421 0.383837 0.416818 0.388613 0.595698 -0.751431 -1.812569 63 0.783028 0.758650 0.821157 0.777318 0.707787 -0.779556 -0.571109 105 -0.615611 -0.554651 -0.594871 -0.573033 -0.461688 -0.918600 -0.189121 24 -1.103859 -1.135303 -1.064246 -1.082658 -1.209359 -0.907303 0.192866 news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu cluster 26 0.512664 0.372447 -0.677374 -1.503002 0.366194 1.050759 4 135 -1.106820 3.818192 -2.143669 2.117169 -1.044107 -1.015691 0 63 0.188767 0.059197 -0.188609 2.569690 -1.044107 -1.428981 1 105 1.052492 0.894529 -1.491982 0.126075 0.084134 -0.189111 4 24 -1.106820 -1.193801 1.766450 0.824251 0.567665 -1.251857 2
dfr=pd.DataFrame({'Actual_Price':y_test, 'Predicted_Price':predicted}).reset_index()
dfr.drop(columns='index', inplace=True, axis=1)
dfr.head(10)
Actual_Price | Predicted_Price | |
---|---|---|
0 | -1.159801 | -1.259679 |
1 | 0.471989 | 0.386546 |
2 | 0.753759 | 0.687921 |
3 | -0.716651 | -0.726870 |
4 | -1.131320 | -1.144796 |
5 | -0.671596 | -0.554696 |
6 | 0.666571 | 0.657503 |
7 | -0.405871 | -0.765469 |
8 | -0.043019 | -0.374303 |
9 | 0.400488 | 0.389789 |
from sklearn import metrics
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, predicted))
print('Mean Squared Error (MSE) :', metrics.mean_squared_error(y_test, predicted))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, predicted)))
Mean Absolute Error (MAE): 0.12894628403303512 Mean Squared Error (MSE) : 0.03027884626475542 Root Mean Squared Error (RMSE): 0.1740081787294937
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price, color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()
plt.plot(dfr.Actual_Price, color='black', label='Actual')
plt.plot(dfr.Predicted_Price, color='blue',label='Predicted')
plt.title("Predictions using Historical, Sentiment and Trends")
plt.legend()
<matplotlib.legend.Legend at 0x7fc34cef49a0>
df_x.head()
open | high | low | close | adjclose | volume | cluster | |
---|---|---|---|---|---|---|---|
0 | -1.703818 | -1.730261 | -1.682950 | -1.705764 | -1.872820 | -0.840392 | 2 |
1 | -1.505790 | -1.470011 | -1.490960 | -1.477822 | -1.658905 | -0.124160 | 4 |
2 | -1.285591 | -1.316556 | -1.264110 | -1.286323 | -1.479190 | -0.955812 | 4 |
3 | -1.223978 | -1.236431 | -1.202859 | -1.215514 | -1.412737 | -0.927118 | 4 |
4 | -1.153899 | -1.174523 | -1.112084 | -1.138360 | -1.340332 | -1.205426 | 2 |
X = [[df_x.iloc[j, i] for i in range(df_x.shape[1])] for j in range(df_x.shape[0])]
Y = [df_final.iloc[i,8] for i in range(df_final.shape[0])]
print(X[0:2])
print(Y[0:2])
[[-1.7038183261524975, -1.7302608232721683, -1.6829499939918715, -1.7057639670236395, -1.872820057485404, -0.8403922760048991, 2], [-1.5057895100199, -1.470010607913253, -1.4909599654415322, -1.4778224513157197, -1.6589052190920777, -0.12416025602251, 4]] [-1.738791872212694, -1.6444381302921762]
from sklearn.model_selection import train_test_split
import numpy as np
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)
train_X = np.array(x_train)
test_X = np.array(x_test)
train_Y = np.array(y_train)
test_Y = np.array(y_test)
train_X = train_X.reshape(train_X.shape[0],1,7,1)
test_X = test_X.reshape(test_X.shape[0],1,7,1)
print(len(train_X))
print(len(test_X))
107 46
# For creating model and training
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
from tensorflow.keras.layers import MaxPooling1D, Flatten
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.metrics import RootMeanSquaredError
model = tf.keras.Sequential()
# Creating the Neural Network model here...
# CNN layers
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu', input_shape=(None,7, 1))))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(128, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Flatten()))
#model.add(Dense(5, kernel_regularizer=L2(0.01)))
# LSTM layers
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dropout(0.5))
#Final layers
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])
history = model.fit(train_X, train_Y, validation_data=(test_X,test_Y), epochs=40,batch_size=40, verbose=1, shuffle =True)
Epoch 1/40 3/3 [==============================] - 17s 2s/step - loss: 0.8435 - mse: 0.8435 - mae: 0.7191 - val_loss: 1.2994 - val_mse: 1.2994 - val_mae: 0.8909 Epoch 2/40 3/3 [==============================] - 0s 41ms/step - loss: 0.7868 - mse: 0.7868 - mae: 0.6972 - val_loss: 1.1748 - val_mse: 1.1748 - val_mae: 0.8485 Epoch 3/40 3/3 [==============================] - 0s 47ms/step - loss: 0.6732 - mse: 0.6732 - mae: 0.6493 - val_loss: 0.8909 - val_mse: 0.8909 - val_mae: 0.7464 Epoch 4/40 3/3 [==============================] - 0s 42ms/step - loss: 0.4508 - mse: 0.4508 - mae: 0.5399 - val_loss: 0.4124 - val_mse: 0.4124 - val_mae: 0.5224 Epoch 5/40 3/3 [==============================] - 0s 49ms/step - loss: 0.1894 - mse: 0.1894 - mae: 0.3295 - val_loss: 0.0471 - val_mse: 0.0471 - val_mae: 0.1709 Epoch 6/40 3/3 [==============================] - 0s 44ms/step - loss: 0.0823 - mse: 0.0823 - mae: 0.2286 - val_loss: 0.1999 - val_mse: 0.1999 - val_mae: 0.3940 Epoch 7/40 3/3 [==============================] - 0s 41ms/step - loss: 0.1960 - mse: 0.1960 - mae: 0.3494 - val_loss: 0.0906 - val_mse: 0.0906 - val_mae: 0.2370 Epoch 8/40 3/3 [==============================] - 0s 47ms/step - loss: 0.0836 - mse: 0.0836 - mae: 0.2222 - val_loss: 0.0874 - val_mse: 0.0874 - val_mae: 0.2231 Epoch 9/40 3/3 [==============================] - 0s 44ms/step - loss: 0.0851 - mse: 0.0851 - mae: 0.2084 - val_loss: 0.1338 - val_mse: 0.1338 - val_mae: 0.2902 Epoch 10/40 3/3 [==============================] - 0s 46ms/step - loss: 0.0925 - mse: 0.0925 - mae: 0.2278 - val_loss: 0.0918 - val_mse: 0.0918 - val_mae: 0.2423 Epoch 11/40 3/3 [==============================] - 0s 60ms/step - loss: 0.0644 - mse: 0.0644 - mae: 0.1962 - val_loss: 0.0461 - val_mse: 0.0461 - val_mae: 0.1661 Epoch 12/40 3/3 [==============================] - 0s 68ms/step - loss: 0.0565 - mse: 0.0565 - mae: 0.1744 - val_loss: 0.0339 - val_mse: 0.0339 - val_mae: 0.1346 Epoch 13/40 3/3 [==============================] - 0s 63ms/step - loss: 0.0756 - mse: 0.0756 - mae: 0.1995 - val_loss: 0.0467 - val_mse: 0.0467 - val_mae: 0.1664 Epoch 14/40 3/3 [==============================] - 0s 65ms/step - loss: 0.0591 - mse: 0.0591 - mae: 0.1964 - val_loss: 0.0582 - val_mse: 0.0582 - val_mae: 0.1929 Epoch 15/40 3/3 [==============================] - 0s 75ms/step - loss: 0.0614 - mse: 0.0614 - mae: 0.1892 - val_loss: 0.0528 - val_mse: 0.0528 - val_mae: 0.1754 Epoch 16/40 3/3 [==============================] - 0s 69ms/step - loss: 0.0429 - mse: 0.0429 - mae: 0.1579 - val_loss: 0.0513 - val_mse: 0.0513 - val_mae: 0.1761 Epoch 17/40 3/3 [==============================] - 0s 76ms/step - loss: 0.0553 - mse: 0.0553 - mae: 0.1851 - val_loss: 0.0463 - val_mse: 0.0463 - val_mae: 0.1687 Epoch 18/40 3/3 [==============================] - 0s 75ms/step - loss: 0.0602 - mse: 0.0602 - mae: 0.1794 - val_loss: 0.0393 - val_mse: 0.0393 - val_mae: 0.1505 Epoch 19/40 3/3 [==============================] - 0s 71ms/step - loss: 0.0561 - mse: 0.0561 - mae: 0.1799 - val_loss: 0.0444 - val_mse: 0.0444 - val_mae: 0.1655 Epoch 20/40 3/3 [==============================] - 0s 66ms/step - loss: 0.0568 - mse: 0.0568 - mae: 0.1719 - val_loss: 0.0313 - val_mse: 0.0313 - val_mae: 0.1319 Epoch 21/40 3/3 [==============================] - 0s 67ms/step - loss: 0.0393 - mse: 0.0393 - mae: 0.1526 - val_loss: 0.0319 - val_mse: 0.0319 - val_mae: 0.1335 Epoch 22/40 3/3 [==============================] - 0s 64ms/step - loss: 0.0532 - mse: 0.0532 - mae: 0.1732 - val_loss: 0.0297 - val_mse: 0.0297 - val_mae: 0.1321 Epoch 23/40 3/3 [==============================] - 0s 66ms/step - loss: 0.0488 - mse: 0.0488 - mae: 0.1769 - val_loss: 0.0392 - val_mse: 0.0392 - val_mae: 0.1575 Epoch 24/40 3/3 [==============================] - 0s 70ms/step - loss: 0.0447 - mse: 0.0447 - mae: 0.1678 - val_loss: 0.0433 - val_mse: 0.0433 - val_mae: 0.1657 Epoch 25/40 3/3 [==============================] - 0s 68ms/step - loss: 0.0409 - mse: 0.0409 - mae: 0.1538 - val_loss: 0.0338 - val_mse: 0.0338 - val_mae: 0.1421 Epoch 26/40 3/3 [==============================] - 0s 71ms/step - loss: 0.0364 - mse: 0.0364 - mae: 0.1565 - val_loss: 0.0295 - val_mse: 0.0295 - val_mae: 0.1335 Epoch 27/40 3/3 [==============================] - 0s 42ms/step - loss: 0.0438 - mse: 0.0438 - mae: 0.1651 - val_loss: 0.0310 - val_mse: 0.0310 - val_mae: 0.1321 Epoch 28/40 3/3 [==============================] - 0s 41ms/step - loss: 0.0585 - mse: 0.0585 - mae: 0.1756 - val_loss: 0.0451 - val_mse: 0.0451 - val_mae: 0.1623 Epoch 29/40 3/3 [==============================] - 0s 40ms/step - loss: 0.0581 - mse: 0.0581 - mae: 0.1712 - val_loss: 0.0460 - val_mse: 0.0460 - val_mae: 0.1629 Epoch 30/40 3/3 [==============================] - 0s 52ms/step - loss: 0.0503 - mse: 0.0503 - mae: 0.1695 - val_loss: 0.0344 - val_mse: 0.0344 - val_mae: 0.1425 Epoch 31/40 3/3 [==============================] - 0s 53ms/step - loss: 0.0552 - mse: 0.0552 - mae: 0.1676 - val_loss: 0.0413 - val_mse: 0.0413 - val_mae: 0.1576 Epoch 32/40 3/3 [==============================] - 0s 45ms/step - loss: 0.0430 - mse: 0.0430 - mae: 0.1617 - val_loss: 0.0427 - val_mse: 0.0427 - val_mae: 0.1586 Epoch 33/40 3/3 [==============================] - 0s 45ms/step - loss: 0.0445 - mse: 0.0445 - mae: 0.1510 - val_loss: 0.0305 - val_mse: 0.0305 - val_mae: 0.1326 Epoch 34/40 3/3 [==============================] - 0s 44ms/step - loss: 0.0552 - mse: 0.0552 - mae: 0.1731 - val_loss: 0.0424 - val_mse: 0.0424 - val_mae: 0.1598 Epoch 35/40 3/3 [==============================] - 0s 45ms/step - loss: 0.0584 - mse: 0.0584 - mae: 0.1776 - val_loss: 0.0609 - val_mse: 0.0609 - val_mae: 0.1907 Epoch 36/40 3/3 [==============================] - 0s 44ms/step - loss: 0.0492 - mse: 0.0492 - mae: 0.1695 - val_loss: 0.0427 - val_mse: 0.0427 - val_mae: 0.1609 Epoch 37/40 3/3 [==============================] - 0s 45ms/step - loss: 0.0339 - mse: 0.0339 - mae: 0.1425 - val_loss: 0.0296 - val_mse: 0.0296 - val_mae: 0.1355 Epoch 38/40 3/3 [==============================] - 0s 54ms/step - loss: 0.0438 - mse: 0.0438 - mae: 0.1539 - val_loss: 0.0316 - val_mse: 0.0316 - val_mae: 0.1407 Epoch 39/40 3/3 [==============================] - 0s 49ms/step - loss: 0.0487 - mse: 0.0487 - mae: 0.1662 - val_loss: 0.0441 - val_mse: 0.0441 - val_mae: 0.1637 Epoch 40/40 3/3 [==============================] - 0s 45ms/step - loss: 0.0563 - mse: 0.0563 - mae: 0.1701 - val_loss: 0.0616 - val_mse: 0.0616 - val_mae: 0.1905
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7fc354999cf0>
model.evaluate(test_X, test_Y)
2/2 [==============================] - 0s 12ms/step - loss: 0.0616 - mse: 0.0616 - mae: 0.1905
[0.061608511954545975, 0.061608511954545975, 0.1905282735824585]
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
# predict probabilities for test set
yhat_probs = model.predict(test_X, verbose=0)
# reduce to 1d array
yhat_probs = yhat_probs[:, 0]
var = explained_variance_score(test_Y.reshape(-1,1), yhat_probs)
print('Variance: %f' % var)
r2 = r2_score(test_Y.reshape(-1,1), yhat_probs)
print('R2 Score: %f' % var)
var2 = max_error(test_Y.reshape(-1,1), yhat_probs)
print('Max Error: %f' % var2)
Variance: 0.964044 R2 Score: 0.964044 Max Error: 0.661186
predicted = model.predict(test_X)
test_label = test_Y.reshape(-1,1)
predicted = np.array(predicted[:,0]).reshape(-1,1)
len_t = len(train_X)
for j in range(len_t , len_t + len(test_X)):
temp = df_final.iloc[j,3]
test_label[j - len_t] = test_label[j - len_t] * temp + temp
predicted[j - len_t] = predicted[j - len_t] * temp + temp
plt.plot(predicted, color = 'green', label = 'Predicted Stock Price')
plt.plot(test_label, color = 'red', label = 'Real Stock Price')
plt.title(' Stock Price Prediction')
plt.xlabel('Time')
plt.ylabel(' Stock Price')
plt.legend()
plt.show()
2/2 [==============================] - 0s 9ms/step
X = [[df_values.iloc[j, i] for i in range(df_values.shape[1])] for j in range(df_values.shape[0])]
Y = [df_final.iloc[i,8] for i in range(df_final.shape[0])]
print(X[0:2])
print(Y[0:2])
[[-1.7038183261524975, -1.7302608232721683, -1.6829499939918715, -1.7057639670236395, -1.872820057485404, -0.8403922760048991, 2.580289089276951, -1.1068195873526132, -1.1938008487946932, 1.7664503220919647, -1.5030018624207688, -0.1039065146895656, 1.464048694140032, 2], [-1.5057895100199, -1.470010607913253, -1.4909599654415322, -1.4778224513157197, -1.6589052190920777, -0.12416025602251, 2.198301444710552, -1.1068195873526132, 3.504942032500697, -1.8992863327148637, -1.5030018624207688, 1.2123738405060045, 0.3068368015117597, 4]] [-1.738791872212694, -1.6444381302921762]
from sklearn.model_selection import train_test_split
import numpy as np
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)
train_X = np.array(x_train)
test_X = np.array(x_test)
train_Y = np.array(y_train)
test_Y = np.array(y_test)
train_X = train_X.reshape(train_X.shape[0],1,14,1)
test_X = test_X.reshape(test_X.shape[0],1,14,1)
print(len(train_X))
print(len(test_X))
107 46
# For creating model and training
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
from tensorflow.keras.layers import MaxPooling1D, Flatten
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.metrics import RootMeanSquaredError
model = tf.keras.Sequential()
# Creating the Neural Network model here...
# CNN layers
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu', input_shape=(None,14, 1))))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(128, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Flatten()))
#model.add(Dense(5, kernel_regularizer=L2(0.01)))
# LSTM layers
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dropout(0.5))
#Final layers
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])
history = model.fit(train_X, train_Y, validation_data=(test_X,test_Y), epochs=40,batch_size=40, verbose=1, shuffle =True)
Epoch 1/40 3/3 [==============================] - 13s 893ms/step - loss: 0.9283 - mse: 0.9283 - mae: 0.7439 - val_loss: 1.0816 - val_mse: 1.0816 - val_mae: 0.8186 Epoch 2/40 3/3 [==============================] - 0s 47ms/step - loss: 0.8707 - mse: 0.8707 - mae: 0.7204 - val_loss: 0.9586 - val_mse: 0.9586 - val_mae: 0.7668 Epoch 3/40 3/3 [==============================] - 0s 50ms/step - loss: 0.7502 - mse: 0.7502 - mae: 0.6681 - val_loss: 0.6875 - val_mse: 0.6875 - val_mae: 0.6335 Epoch 4/40 3/3 [==============================] - 0s 52ms/step - loss: 0.4556 - mse: 0.4556 - mae: 0.5155 - val_loss: 0.2252 - val_mse: 0.2252 - val_mae: 0.3242 Epoch 5/40 3/3 [==============================] - 0s 50ms/step - loss: 0.1291 - mse: 0.1291 - mae: 0.2721 - val_loss: 0.1801 - val_mse: 0.1801 - val_mae: 0.3321 Epoch 6/40 3/3 [==============================] - 0s 51ms/step - loss: 0.2382 - mse: 0.2382 - mae: 0.3739 - val_loss: 0.2083 - val_mse: 0.2083 - val_mae: 0.3451 Epoch 7/40 3/3 [==============================] - 0s 51ms/step - loss: 0.1393 - mse: 0.1393 - mae: 0.2724 - val_loss: 0.0689 - val_mse: 0.0689 - val_mae: 0.1855 Epoch 8/40 3/3 [==============================] - 0s 43ms/step - loss: 0.0738 - mse: 0.0738 - mae: 0.2105 - val_loss: 0.1492 - val_mse: 0.1492 - val_mae: 0.2463 Epoch 9/40 3/3 [==============================] - 0s 47ms/step - loss: 0.1093 - mse: 0.1093 - mae: 0.2484 - val_loss: 0.1612 - val_mse: 0.1612 - val_mae: 0.2658 Epoch 10/40 3/3 [==============================] - 0s 53ms/step - loss: 0.1245 - mse: 0.1245 - mae: 0.2644 - val_loss: 0.0971 - val_mse: 0.0971 - val_mae: 0.2057 Epoch 11/40 3/3 [==============================] - 0s 56ms/step - loss: 0.0750 - mse: 0.0750 - mae: 0.1969 - val_loss: 0.0586 - val_mse: 0.0586 - val_mae: 0.1809 Epoch 12/40 3/3 [==============================] - 0s 52ms/step - loss: 0.0657 - mse: 0.0657 - mae: 0.1980 - val_loss: 0.0731 - val_mse: 0.0731 - val_mae: 0.2134 Epoch 13/40 3/3 [==============================] - 0s 53ms/step - loss: 0.0723 - mse: 0.0723 - mae: 0.2060 - val_loss: 0.0743 - val_mse: 0.0743 - val_mae: 0.2120 Epoch 14/40 3/3 [==============================] - 0s 50ms/step - loss: 0.0556 - mse: 0.0556 - mae: 0.1859 - val_loss: 0.0583 - val_mse: 0.0583 - val_mae: 0.1829 Epoch 15/40 3/3 [==============================] - 0s 54ms/step - loss: 0.0518 - mse: 0.0518 - mae: 0.1744 - val_loss: 0.0575 - val_mse: 0.0575 - val_mae: 0.1691 Epoch 16/40 3/3 [==============================] - 0s 54ms/step - loss: 0.0541 - mse: 0.0541 - mae: 0.1732 - val_loss: 0.0638 - val_mse: 0.0638 - val_mae: 0.1766 Epoch 17/40 3/3 [==============================] - 0s 48ms/step - loss: 0.0530 - mse: 0.0530 - mae: 0.1773 - val_loss: 0.0641 - val_mse: 0.0641 - val_mae: 0.1815 Epoch 18/40 3/3 [==============================] - 0s 49ms/step - loss: 0.0559 - mse: 0.0559 - mae: 0.1679 - val_loss: 0.0602 - val_mse: 0.0602 - val_mae: 0.1766 Epoch 19/40 3/3 [==============================] - 0s 47ms/step - loss: 0.0456 - mse: 0.0456 - mae: 0.1608 - val_loss: 0.0615 - val_mse: 0.0615 - val_mae: 0.1804 Epoch 20/40 3/3 [==============================] - 0s 48ms/step - loss: 0.0413 - mse: 0.0413 - mae: 0.1605 - val_loss: 0.0617 - val_mse: 0.0617 - val_mae: 0.1815 Epoch 21/40 3/3 [==============================] - 0s 51ms/step - loss: 0.0497 - mse: 0.0497 - mae: 0.1733 - val_loss: 0.0627 - val_mse: 0.0627 - val_mae: 0.1825 Epoch 22/40 3/3 [==============================] - 0s 53ms/step - loss: 0.0401 - mse: 0.0401 - mae: 0.1543 - val_loss: 0.0651 - val_mse: 0.0651 - val_mae: 0.1765 Epoch 23/40 3/3 [==============================] - 0s 51ms/step - loss: 0.0520 - mse: 0.0520 - mae: 0.1778 - val_loss: 0.0619 - val_mse: 0.0619 - val_mae: 0.1746 Epoch 24/40 3/3 [==============================] - 0s 51ms/step - loss: 0.0423 - mse: 0.0423 - mae: 0.1565 - val_loss: 0.0629 - val_mse: 0.0629 - val_mae: 0.1894 Epoch 25/40 3/3 [==============================] - 0s 51ms/step - loss: 0.0293 - mse: 0.0293 - mae: 0.1376 - val_loss: 0.0655 - val_mse: 0.0655 - val_mae: 0.1943 Epoch 26/40 3/3 [==============================] - 0s 53ms/step - loss: 0.0472 - mse: 0.0472 - mae: 0.1656 - val_loss: 0.0562 - val_mse: 0.0562 - val_mae: 0.1722 Epoch 27/40 3/3 [==============================] - 0s 55ms/step - loss: 0.0426 - mse: 0.0426 - mae: 0.1676 - val_loss: 0.0572 - val_mse: 0.0572 - val_mae: 0.1703 Epoch 28/40 3/3 [==============================] - 0s 48ms/step - loss: 0.0424 - mse: 0.0424 - mae: 0.1590 - val_loss: 0.0587 - val_mse: 0.0587 - val_mae: 0.1770 Epoch 29/40 3/3 [==============================] - 0s 48ms/step - loss: 0.0371 - mse: 0.0371 - mae: 0.1488 - val_loss: 0.0595 - val_mse: 0.0595 - val_mae: 0.1810 Epoch 30/40 3/3 [==============================] - 0s 53ms/step - loss: 0.0390 - mse: 0.0390 - mae: 0.1529 - val_loss: 0.0626 - val_mse: 0.0626 - val_mae: 0.1866 Epoch 31/40 3/3 [==============================] - 0s 55ms/step - loss: 0.0418 - mse: 0.0418 - mae: 0.1524 - val_loss: 0.0601 - val_mse: 0.0601 - val_mae: 0.1780 Epoch 32/40 3/3 [==============================] - 0s 49ms/step - loss: 0.0474 - mse: 0.0474 - mae: 0.1659 - val_loss: 0.0569 - val_mse: 0.0569 - val_mae: 0.1792 Epoch 33/40 3/3 [==============================] - 0s 51ms/step - loss: 0.0389 - mse: 0.0389 - mae: 0.1597 - val_loss: 0.0588 - val_mse: 0.0588 - val_mae: 0.1899 Epoch 34/40 3/3 [==============================] - 0s 50ms/step - loss: 0.0407 - mse: 0.0407 - mae: 0.1512 - val_loss: 0.0592 - val_mse: 0.0592 - val_mae: 0.1928 Epoch 35/40 3/3 [==============================] - 0s 46ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1503 - val_loss: 0.0587 - val_mse: 0.0587 - val_mae: 0.1883 Epoch 36/40 3/3 [==============================] - 0s 42ms/step - loss: 0.0445 - mse: 0.0445 - mae: 0.1519 - val_loss: 0.0603 - val_mse: 0.0603 - val_mae: 0.1886 Epoch 37/40 3/3 [==============================] - 0s 49ms/step - loss: 0.0281 - mse: 0.0281 - mae: 0.1283 - val_loss: 0.0613 - val_mse: 0.0613 - val_mae: 0.1905 Epoch 38/40 3/3 [==============================] - 0s 51ms/step - loss: 0.0380 - mse: 0.0380 - mae: 0.1487 - val_loss: 0.0623 - val_mse: 0.0623 - val_mae: 0.1884 Epoch 39/40 3/3 [==============================] - 0s 43ms/step - loss: 0.0316 - mse: 0.0316 - mae: 0.1320 - val_loss: 0.0640 - val_mse: 0.0640 - val_mae: 0.1942 Epoch 40/40 3/3 [==============================] - 0s 51ms/step - loss: 0.0405 - mse: 0.0405 - mae: 0.1428 - val_loss: 0.0701 - val_mse: 0.0701 - val_mae: 0.1984
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7fc34d0a2080>
model.evaluate(test_X, test_Y)
2/2 [==============================] - 0s 13ms/step - loss: 0.0701 - mse: 0.0701 - mae: 0.1984
[0.07009752839803696, 0.07009752839803696, 0.19842290878295898]
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
# predict probabilities for test set
yhat_probs = model.predict(test_X, verbose=0)
# reduce to 1d array
yhat_probs = yhat_probs[:, 0]
var = explained_variance_score(test_Y.reshape(-1,1), yhat_probs)
print('Variance: %f' % var)
r2 = r2_score(test_Y.reshape(-1,1), yhat_probs)
print('R2 Score: %f' % var)
var2 = max_error(test_Y.reshape(-1,1), yhat_probs)
print('Max Error: %f' % var2)
Variance: 0.943367 R2 Score: 0.943367 Max Error: 0.796149
predicted = model.predict(test_X)
test_label = test_Y.reshape(-1,1)
predicted = np.array(predicted[:,0]).reshape(-1,1)
len_t = len(train_X)
for j in range(len_t , len_t + len(test_X)):
temp = df_final.iloc[j,3]
test_label[j - len_t] = test_label[j - len_t] * temp + temp
predicted[j - len_t] = predicted[j - len_t] * temp + temp
plt.plot(predicted, color = 'green', label = 'Predicted Stock Price')
plt.plot(test_label, color = 'red', label = 'Real Stock Price')
plt.title(' Stock Price Prediction')
plt.xlabel('Time')
plt.ylabel(' Stock Price')
plt.legend()
plt.show()
2/2 [==============================] - 0s 12ms/step