Predicting Stock Prices of Home Depot Based on Trends and the Sentiment of News and Tweets

Betul Mescioglu
import pandas as pd
import seaborn
import matplotlib.pyplot as plt
import datetime as dt
from datetime import date
from dateutil.relativedelta import relativedelta
!pip install yahoo_fin

Stock Data of Home Depot:

from yahoo_fin import stock_info as si

def stock_data(stock_name):
    #Pull the last three years of daily OHLCV data from Yahoo Finance via yahoo_fin
    df = si.get_data(stock_name, start_date=(date.today() - relativedelta(years=3)).strftime('%m/%d/%Y'),
                     end_date=date.today().strftime('%m/%d/%Y'))
    return df
ticker='HD'
company_name='Home Depot'
df = stock_data(ticker)
df.head()
open high low close adjclose volume ticker
2020-06-08 252.490005 256.809998 252.259995 256.769989 239.264954 3811900 HD
2020-06-09 255.330002 258.290009 253.860001 256.760010 239.255646 3716100 HD
2020-06-10 257.450012 259.290009 254.220001 254.449997 237.103104 3589800 HD
2020-06-11 248.860001 250.619995 238.740005 239.470001 223.144394 6563700 HD
2020-06-12 243.070007 246.389999 237.050003 242.449997 225.921204 5238500 HD
df.tail()
open high low close adjclose volume ticker
2023-05-31 289.589996 290.000000 281.959991 283.450012 283.450012 18288800 HD
2023-06-01 284.049988 289.220001 279.980011 288.390015 288.390015 4305100 HD
2023-06-02 290.649994 296.209991 289.720001 295.940002 295.940002 4514600 HD
2023-06-05 295.619995 295.720001 291.369995 293.100006 293.100006 3028800 HD
2023-06-06 291.820007 296.920013 291.649994 296.000000 296.000000 2854800 HD
#Add target values for two weeks and one month.
#The dataset has values for trading days only, so two weeks corresponds to 10 data points
#and one month to 20 data points.
df['TwoWeeks'] = df['close'].rolling(10).mean()
df['Month'] = df['close'].rolling(20).mean()
df.head()
open high low close adjclose volume ticker TwoWeeks Month
2020-06-08 252.490005 256.809998 252.259995 256.769989 239.264938 3811900 HD NaN NaN
2020-06-09 255.330002 258.290009 253.860001 256.760010 239.255646 3716100 HD NaN NaN
2020-06-10 257.450012 259.290009 254.220001 254.449997 237.103119 3589800 HD NaN NaN
2020-06-11 248.860001 250.619995 238.740005 239.470001 223.144379 6563700 HD NaN NaN
2020-06-12 243.070007 246.389999 237.050003 242.449997 225.921188 5238500 HD NaN NaN
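As a quick sanity check on the rolling windows (a sketch; the NaNs above come from the ten-day warm-up period), the first non-NaN 'TwoWeeks' value should equal the mean of the first ten closes:

# The 10-day rolling mean only produces a value once ten closes are available,
# which is why the earliest rows above show NaN.
first_valid = df['TwoWeeks'].first_valid_index()
assert abs(df.loc[first_valid, 'TwoWeeks'] - df['close'].iloc[:10].mean()) < 1e-6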
df.dropna(inplace=True)
df.head()
open high low close adjclose volume ticker TwoWeeks Month
2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800 HD 247.481999 248.1370
2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800 HD 247.300999 247.6660
2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000 HD 247.187000 247.2865
2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700 HD 247.370000 246.9620
2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300 HD 247.842999 247.4940
!pip install pytrends
import pytrends
from pytrends.request import TrendReq
pytrends = TrendReq()
kw_list=['Home Depot']
#Gather Google Trends interest for the 'Home Depot' search term over the last three years
pytrends.build_payload(kw_list, geo='',
                       timeframe='{} {}'.format((date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'),
                                                date.today().strftime('%Y-%m-%d')))
keyword_interest = pytrends.interest_over_time()
del keyword_interest['isPartial']
keyword_interest.head()
Home Depot
date
2020-06-07 100
2020-06-14 100
2020-06-21 96
2020-06-28 97
2020-07-05 90
keyword_interest['Home Depot'].plot(title='Google Trends for \'Home Depot\' in the last three years')
<AxesSubplot: title={'center': "Google Trends for 'Home Depot' in the last three years"}>
df_combined = pd.concat([df, keyword_interest], axis=1)
#df_combined.dropna(subset='Month', inplace=True)
df_combined.head(20)
open high low close adjclose volume ticker TwoWeeks Month Home Depot
2020-06-07 NaN NaN NaN NaN NaN NaN NaN NaN NaN 100.0
2020-06-14 NaN NaN NaN NaN NaN NaN NaN NaN NaN 100.0
2020-06-21 NaN NaN NaN NaN NaN NaN NaN NaN NaN 96.0
2020-06-28 NaN NaN NaN NaN NaN NaN NaN NaN NaN 97.0
2020-07-05 NaN NaN NaN NaN NaN NaN NaN NaN NaN 90.0
2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800.0 HD 247.481999 248.137000 NaN
2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800.0 HD 247.300999 247.666000 NaN
2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000.0 HD 247.187000 247.286500 NaN
2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700.0 HD 247.370000 246.962000 NaN
2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300.0 HD 247.842999 247.494000 NaN
2020-07-12 NaN NaN NaN NaN NaN NaN NaN NaN NaN 86.0
2020-07-13 251.919998 257.859985 249.089996 249.619995 232.602386 4437500.0 HD 248.703999 247.852500 NaN
2020-07-14 249.000000 258.179993 248.460007 257.790009 240.215408 4614200.0 HD 249.871001 248.674001 NaN
2020-07-15 260.140015 261.290009 255.149994 257.799988 240.224716 4343700.0 HD 250.600000 249.066500 NaN
2020-07-16 256.760010 260.500000 256.000000 258.079987 240.485641 2511200.0 HD 251.592999 249.427999 NaN
2020-07-17 260.029999 260.649994 257.720001 260.380005 242.628845 3091300.0 HD 252.781000 249.986499 NaN
2020-07-19 NaN NaN NaN NaN NaN NaN NaN NaN NaN 85.0
2020-07-20 259.040009 261.200012 258.019989 260.170013 242.433228 2451100.0 HD 253.843001 250.662500 NaN
2020-07-21 261.619995 263.869995 260.720001 262.420013 244.529800 2439900.0 HD 255.350002 251.325500 NaN
2020-07-22 262.589996 265.589996 262.000000 265.170013 247.092316 2750000.0 HD 256.950003 252.068501 NaN

When data is pulled over a multi-year range, Google Trends returns weekly values dated on Sundays. I will forward-fill each Sunday value across the following week, since we are concerned with trends rather than daily values.

df_combined['Home Depot'] = df_combined['Home Depot'].fillna(method='ffill')
df_combined.dropna(inplace=True)
df_combined.head(30)
open high low close adjclose volume ticker TwoWeeks Month Home Depot
2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800.0 HD 247.481999 248.137000 90.0
2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800.0 HD 247.300999 247.666000 90.0
2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000.0 HD 247.187000 247.286500 90.0
2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700.0 HD 247.370000 246.962000 90.0
2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300.0 HD 247.842999 247.494000 90.0
2020-07-13 251.919998 257.859985 249.089996 249.619995 232.602386 4437500.0 HD 248.703999 247.852500 86.0
2020-07-14 249.000000 258.179993 248.460007 257.790009 240.215408 4614200.0 HD 249.871001 248.674001 86.0
2020-07-15 260.140015 261.290009 255.149994 257.799988 240.224716 4343700.0 HD 250.600000 249.066500 86.0
2020-07-16 256.760010 260.500000 256.000000 258.079987 240.485641 2511200.0 HD 251.592999 249.427999 86.0
2020-07-17 260.029999 260.649994 257.720001 260.380005 242.628845 3091300.0 HD 252.781000 249.986499 86.0
2020-07-20 259.040009 261.200012 258.019989 260.170013 242.433228 2451100.0 HD 253.843001 250.662500 85.0
2020-07-21 261.619995 263.869995 260.720001 262.420013 244.529800 2439900.0 HD 255.350002 251.325500 85.0
2020-07-22 262.589996 265.589996 262.000000 265.170013 247.092316 2750000.0 HD 256.950003 252.068501 85.0
2020-07-23 267.799988 267.799988 261.799988 263.809998 245.824997 2680100.0 HD 258.535002 252.952501 85.0
2020-07-24 265.040009 266.890015 262.989990 265.309998 247.222717 2984500.0 HD 260.055002 253.949001 85.0
2020-07-27 265.089996 268.679993 265.089996 267.420013 249.188919 2412500.0 HD 261.835004 255.269501 85.0
2020-07-28 268.559998 269.070007 264.670013 265.279999 247.194794 2227000.0 HD 262.584003 256.227502 85.0
2020-07-29 264.799988 267.109985 264.170013 264.660004 246.617065 2874100.0 HD 263.270004 256.935002 85.0
2020-07-30 263.339996 267.350006 261.549988 266.309998 248.154556 2347800.0 HD 264.093005 257.843002 85.0
2020-07-31 265.000000 267.170013 260.609985 265.489990 247.390457 3640600.0 HD 264.604004 258.692502 85.0
2020-08-03 266.730011 268.579987 265.670013 266.179993 248.033463 2363500.0 HD 265.205002 259.524001 80.0
2020-08-04 266.630005 267.890015 263.839996 267.869995 249.608231 2224000.0 HD 265.750000 260.550001 80.0
2020-08-05 268.390015 268.390015 265.890015 267.480011 249.244797 1959900.0 HD 265.981000 261.465501 80.0
2020-08-06 266.600006 270.440002 266.529999 269.369995 251.005936 2203400.0 HD 266.537000 262.536001 80.0
2020-08-07 270.609985 274.920013 269.809998 271.640015 253.121140 2846300.0 HD 267.170001 263.612502 80.0
2020-08-10 272.420013 275.000000 271.799988 274.730011 256.000580 2393100.0 HD 267.901001 264.868002 79.0
2020-08-11 277.690002 279.369995 274.410004 274.920013 256.177612 3321300.0 HD 268.865002 265.724503 79.0
2020-08-12 279.750000 282.970001 276.959991 281.579987 262.383514 3867900.0 HD 270.557001 266.913503 79.0
2020-08-13 281.160004 282.649994 279.739990 281.660004 262.458069 2202400.0 HD 272.092001 268.092503 79.0
2020-08-14 281.140015 282.000000 279.190002 280.549988 261.423767 2490400.0 HD 273.598001 269.101003 79.0

Home Depot on Twitter:

!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
kw_list
['HD', 'Home Depot']

# The following code collects tweets mentioning Home Depot from the
# "markets" and "MarketWatch" Twitter accounts published in the last three years

import snscrape.modules.twitter as sntwitter
import pandas as pd
import datetime as dt
from datetime import date

# Creating list to append tweet data to
attributes_container = []

sources_all =['CNBC','cnn', 'cnnbrk','MarketWatch', 'Benzinga', 'Stocktwits','BreakoutStocks', 
          'bespokeinvest','WSJMarkets','Stephanie_Link','nytimesbusiness','IBDinvestors','WSJDealJournal',
         'business', 'TheEconomist','WSJ', 'ABC', 'CBSNews','FoxNews', 'NBCNews']

sources=['markets','MarketWatch']
for tweet in sntwitter.TwitterSearchScraper('from:{} since:{} until:{}'.format(sources[0], (date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'), date.today().strftime('%Y-%m-%d'))).get_items():
    if 'Home Depot'.lower() in tweet.content.lower():
        attributes_container.append([tweet.date, tweet.content.split('http')[0]])
tweets_df = pd.DataFrame(attributes_container, columns=["Date Created", "Tweets"])
tweets_df.drop_duplicates(subset='Tweets',inplace=True)

attributes_container = []
for tweet in sntwitter.TwitterSearchScraper('from:{} since:{} until:{}'.format(sources[1], (date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'), date.today().strftime('%Y-%m-%d'))).get_items():
    if 'Home Depot'.lower() in tweet.content.lower():
        attributes_container.append([tweet.date, tweet.content.split('http')[0]])
# Creating a dataframe from the tweets list above 
tweets_df_marketwatch = pd.DataFrame(attributes_container, columns=["Date Created", "Tweets"])
print(tweets_df_marketwatch.shape)
tweets_df_marketwatch.drop_duplicates(subset='Tweets',inplace=True)
pd.set_option("max_colwidth", None)
tweets_df.head()
Date Created Tweets
0 2023-03-01 18:05:43+00:00 From Meta to Home Depot, corporate America is talking about AI on earnings calls
1 2023-02-21 19:06:41+00:00 Home Depot, Walmart and DocuSign.\n\n@RitikaGuptaTV has your stocks to watch this Tuesday
2 2023-02-21 15:15:07+00:00 Home Depot forecasts a fiscal-year profit decline and announces plans for a $1 billion wage investment for hourly workers
3 2022-11-15 18:11:03+00:00 Home Depot reported profit that beat expectations, with the CEO saying consumers are staying resilient. But a drop in transactions volume has investors concerned
4 2022-08-16 15:09:26+00:00 Home Depot’s second-quarter results beat Wall Street estimates even as the US housing market shows signs of cooling off
tweets_df_marketwatch.shape
(1159, 3)
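
As an aside, the same filtering logic could be wrapped in a helper and run over every account in sources_all; a sketch, under the assumption that snscrape can still reach Twitter search:

# Sketch: scrape Home Depot mentions from any account with the same filter as above
def scrape_account(account, keyword='Home Depot'):
    since = (date.today() - relativedelta(years=3)).strftime('%Y-%m-%d')
    until = date.today().strftime('%Y-%m-%d')
    query = 'from:{} since:{} until:{}'.format(account, since, until)
    rows = []
    for tweet in sntwitter.TwitterSearchScraper(query).get_items():
        if keyword.lower() in tweet.content.lower():
            rows.append([tweet.date, tweet.content.split('http')[0]])
    return pd.DataFrame(rows, columns=["Date Created", "Tweets"])

# all_tweets = pd.concat([scrape_account(a) for a in sources_all]).drop_duplicates(subset='Tweets')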

Home Depot in Google News:

!pip install pygooglenews 
kw_list
['Home Depot']

import pygooglenews
from pygooglenews import GoogleNews
gn = GoogleNews()
headlines_related = []

date_list = pd.date_range(end=date.today().strftime('%Y-%m-%d'), start=(date.today()-relativedelta(years=3)).strftime('%Y-%m-%d')).tolist()
clean_date = [str(i).split(" ")[0] for i in date_list]
for day in clean_date:   #loop variable renamed so it does not shadow datetime.date
    headlines = []
    for word in kw_list:
        search = gn.search(word, when=day)
        for item in search['entries']:
            headlines.append(item['title'])
    #The search returns news whose body may contain one of the keywords.
    #We will use only headlines to make predictions, but some headlines do not contain
    #any of the keywords, so keep only the headlines that do.
    for headline in headlines:
        for i in range(len(kw_list)):
            if kw_list[i] in headline:
                #the list stores (day, headline) tuples, so compare the tuple when checking for duplicates
                if (day, headline) not in headlines_related:
                    headlines_related.append((day, headline))
len(headlines_related)
1739
headlines_related[:10]
[('2020-06-08',
  "Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... - USA TODAY"),
 ('2020-06-17',
  "Here's the Defining Characteristic of Home Depot's Success - Nasdaq"),
 ('2020-06-17',
  'Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more - 9to5Toys'),
 ('2020-06-18',
  'Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more - 9to5Toys'),
 ('2020-06-24',
  '1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa - OCRegister'),
 ('2020-06-25',
  "'It's dehumanizing': Home Depot employee felt colleague's racist ... - Hamilton Spectator"),
 ('2020-06-30',
  'Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna-Glo ... - USA TODAY'),
 ('2020-06-30',
  'The Home Depot Announces Renewable Energy Goal and Pledges ... - PR Newswire'),
 ('2020-07-01',
  'Home Depot bans some rope sales after nooses were found tied on ... - Courier Journal'),
 ('2020-07-02',
  'Home Depot changes rope sales practice after nooses are found in store - CNN')]

df_news = pd.DataFrame(headlines_related, columns=['Date', 'Headline'])
#Strip the trailing " - Source" suffix without breaking hyphenated words inside the headline
df_news['Headline'] = df_news['Headline'].apply(lambda x: x.rsplit(' - ', 1)[0])
df_news['Date'] = pd.to_datetime(df_news['Date'])
print(df_news.shape)
df_news.drop_duplicates(subset='Headline', inplace=True)
print(df_news.shape)
df_news.head()
(1739, 2)
(1373, 2)
Date Headline
0 2020-06-08 Home Depot Father's Day sales: Save on DeWalt and Milwaukee ...
1 2020-06-17 Here's the Defining Characteristic of Home Depot's Success
2 2020-06-17 Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more
3 2020-06-18 Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more
4 2020-06-24 1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa

Sentiment Analysis of All Text Data:

Tweets:

!pip install vaderSentiment
tweets_df_marketwatch.drop("User Name", axis=1, inplace=True)
#combine tweets coming from markets, and marketwatch
markets = pd.concat([tweets_df, tweets_df_marketwatch])
markets.shape
(1169, 2)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer() 
def vader(row):
    comp = sid.polarity_scores(row)['compound']
    return comp
import re
markets['Tweets'] = markets['Tweets'].apply(lambda x: re.sub("amp;","&", x))
markets['Tweets'] = markets['Tweets'].apply(lambda x: re.sub("\n"," ", x))
markets['Tweets'] = markets['Tweets'].apply(lambda x: re.sub("\xa0"," ", x))
markets.head()
Date Created Tweets
0 2023-03-01 18:05:43+00:00 From Meta to Home Depot, corporate America is talking about AI on earnings calls
1 2023-02-21 19:06:41+00:00 Home Depot, Walmart and DocuSign. @RitikaGuptaTV has your stocks to watch this Tuesday
2 2023-02-21 15:15:07+00:00 Home Depot forecasts a fiscal-year profit decline and announces plans for a $1 billion wage investment for hourly workers
3 2022-11-15 18:11:03+00:00 Home Depot reported profit that beat expectations, with the CEO saying consumers are staying resilient. But a drop in transactions volume has investors concerned
4 2022-08-16 15:09:26+00:00 Home Depot’s second-quarter results beat Wall Street estimates even as the US housing market shows signs of cooling off
markets['Date Created'] = markets['Date Created'].astype('str')
markets['Date Created'] = markets['Date Created'].apply(lambda x:x[:11])
markets.head()
Date Created Tweets
0 2023-03-01 From Meta to Home Depot, corporate America is talking about AI on earnings calls
1 2023-02-21 Home Depot, Walmart and DocuSign. @RitikaGuptaTV has your stocks to watch this Tuesday
2 2023-02-21 Home Depot forecasts a fiscal-year profit decline and announces plans for a $1 billion wage investment for hourly workers
3 2022-11-15 Home Depot reported profit that beat expectations, with the CEO saying consumers are staying resilient. But a drop in transactions volume has investors concerned
4 2022-08-16 Home Depot’s second-quarter results beat Wall Street estimates even as the US housing market shows signs of cooling off
markets['Date Created'] = pd.to_datetime(markets['Date Created'])
markets.sort_values(by='Date Created', inplace=True)
markets.index =range(markets.shape[0])
markets.head()
Date Created Tweets
0 2020-06-06 What to make of the recent jobs numbers? Joseph Stiglitz says there's more than meets the eye, including those who aren't considered "unemployed," but aren't working. And without intervention, things may get worse, he says. WATCH:
1 2020-06-08 Bull, bear, bull, bear and now a new bull market — whatever’s next, these stocks will outperform, strategist says
2 2020-06-09 HD Supply misses on profit expectations but beats on sales
3 2020-06-11 Normally, Madrid swells with tourists. Our editor @bkollmeyer writes about how locals are reclaiming their city in a summer without travel.
4 2020-06-11 Should I tell my sister that her husband, a notorious spender, has a secret credit card?
markets['VaderSent'] = markets['Tweets'].apply(vader)
markets
Date Created Tweets VaderSent
0 2020-06-06 What to make of the recent jobs numbers? Joseph Stiglitz says there's more than meets the eye, including those who aren't considered "unemployed," but aren't working. And without intervention, things may get worse, he says. WATCH: -0.6310
1 2020-06-08 Bull, bear, bull, bear and now a new bull market — whatever’s next, these stocks will outperform, strategist says 0.0000
2 2020-06-09 HD Supply misses on profit expectations but beats on sales 0.1280
3 2020-06-11 Normally, Madrid swells with tourists. Our editor @bkollmeyer writes about how locals are reclaiming their city in a summer without travel. 0.0000
4 2020-06-11 Should I tell my sister that her husband, a notorious spender, has a secret credit card? -0.0772
... ... ... ...
1164 2023-05-30 Trans designer in Target anti-LGBTQ+ backlash says he was ‘dealt the worst hand’ -0.6249
1165 2023-05-31 Brown-Forman to invest $200 million to expand Tequila distillery in Jalisco, Mexico 0.3182
1166 2023-06-02 Zelle and Chase working to resolve duplicate-payments issue 0.3818
1167 2023-06-03 The ‘best job in America’ pays over $120,000 a year, offers low stress, healthy work-life balance — and its workers are in high demand -0.4019
1168 2023-06-05 Cava sets IPO terms, as restaurant chain is set to be valued at more than $2 billion 0.4404

1169 rows × 3 columns

#Vader gives a compound sentiment score between -1 and 1, -1 being the most negative,
#1 being the most positive and 0 being neutral. Bin each score into pos, neu or neg.
def bins(value):
    if -1 <= value < -0.33:
        return 'neg'
    elif -0.33 <= value <= 0.33:
        return 'neu'
    else:
        return 'pos'
markets['sentiment'] = markets['VaderSent'].apply(bins)
markets.head(10)
Date Created Tweets VaderSent sentiment
0 2020-06-06 What to make of the recent jobs numbers? Joseph Stiglitz says there's more than meets the eye, including those who aren't considered "unemployed," but aren't working. And without intervention, things may get worse, he says. WATCH: -0.6310 neg
1 2020-06-08 Bull, bear, bull, bear and now a new bull market — whatever’s next, these stocks will outperform, strategist says 0.0000 neu
2 2020-06-09 HD Supply misses on profit expectations but beats on sales 0.1280 neu
3 2020-06-11 Normally, Madrid swells with tourists. Our editor @bkollmeyer writes about how locals are reclaiming their city in a summer without travel. 0.0000 neu
4 2020-06-11 Should I tell my sister that her husband, a notorious spender, has a secret credit card? -0.0772 neu
5 2020-06-12 Palantir Technologies Inc. is reportedly preparing to confidentially file for its long-awaited IPO. 0.0000 neu
6 2020-06-15 It's a well-known secret of Wall Street: Little business actually takes place in New York. We spoke to several Wall Street road warriors to learn what the past 3 months have been like when they can't fly anywhere. 0.3612 pos
7 2020-06-16 Dear airline passengers: Wear your face mask or you might get banned from flying. -0.1027 neu
8 2020-06-18 Facebook takes down Trump-Pence ads featuring symbols previously used by Nazis 0.0000 neu
9 2020-06-18 Dow opens with 170 point drop as jobless claims stay elevated -0.2732 neu
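Before aggregating by week, a quick look at the overall class balance (a sketch) confirms that most tweets fall into the neutral bin:

# Share of each sentiment class across all scraped tweets
print(markets['sentiment'].value_counts(normalize=True).round(3))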
#For each week, determine the share of positive, neutral and negative tweets.
#Iterating the weekly GroupBy object yields (week, sentiment-Series) pairs,
#which are stored here as the 'Date' and 'Data' columns.
tweets_weight = pd.DataFrame(markets.groupby([pd.Grouper(key='Date Created', freq='W')])['sentiment'], columns=['Date','Data'])
tweets_weight.head()
for i in range(len(tweets_weight)):
    n = dict(tweets_weight.loc[i,'Data'].value_counts(normalize=True))
    if 'neg' not in n:
        n['neg']=0
    if 'pos' not in n:
        n['pos']=0
    if 'neu' not in n:
        n['neu']=0
    tweets_weight.loc[i,'tweet_pos'] = n['pos']
    tweets_weight.loc[i,'tweet_neg'] = n['neg']
    tweets_weight.loc[i,'tweet_neu'] = n['neu']
tweets_weight
Date Data tweet_pos tweet_neg tweet_neu
0 2020-06-07 0 neg Name: sentiment, dtype: object 0.000000 1.000000 0.000000
1 2020-06-14 1 neu 2 neu 3 neu 4 neu 5 neu Name: sentiment, dtype: object 0.000000 0.000000 1.000000
2 2020-06-21 6 pos 7 neu 8 neu 9 neu 10 pos 11 neu Name: sentiment, dtype: object 0.333333 0.000000 0.666667
3 2020-06-28 12 neu 13 neu 14 neu 15 neu 16 neg 17 neu 18 neu 19 neg 20 neu Name: sentiment, dtype: object 0.000000 0.222222 0.777778
4 2020-07-05 21 neg 22 neg 23 neg 24 neg 25 pos 26 neu 27 neg 28 neu 29 neu 30 neu Name: sentiment, dtype: object 0.100000 0.500000 0.400000
... ... ... ... ... ...
153 2023-05-14 1141 pos 1142 neu 1143 neu 1144 neu 1145 neg Name: sentiment, dtype: object 0.200000 0.200000 0.600000
154 2023-05-21 1146 neu 1147 neu 1148 neu 1149 neu 1150 neu 1151 neu 1152 neg 1153 neu 1154 neg 1155 neu 1156 pos 1157 neu 1158 neu 1159 neu 1160 neu Name: sentiment, dtype: object 0.066667 0.133333 0.800000
155 2023-05-28 1161 neg 1162 neg 1163 neg Name: sentiment, dtype: object 0.000000 1.000000 0.000000
156 2023-06-04 1164 neg 1165 neu 1166 pos 1167 neg Name: sentiment, dtype: object 0.250000 0.500000 0.250000
157 2023-06-11 1168 pos Name: sentiment, dtype: object 1.000000 0.000000 0.000000

158 rows × 5 columns

tweets_weight.drop('Data', inplace=True, axis=1)
tweets_weight.head()
Date tweet_pos tweet_neg tweet_neu
0 2020-06-07 0.000000 1.000000 0.000000
1 2020-06-14 0.000000 0.000000 1.000000
2 2020-06-21 0.333333 0.000000 0.666667
3 2020-06-28 0.000000 0.222222 0.777778
4 2020-07-05 0.100000 0.500000 0.400000
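
For reference, the same weekly shares can be computed more compactly with a grouped value_counts; a sketch equivalent to the loop above:

# Equivalent, more compact computation of the weekly sentiment shares
weekly_shares = (
    markets.groupby(pd.Grouper(key='Date Created', freq='W'))['sentiment']
           .value_counts(normalize=True)
           .unstack(fill_value=0)
           .rename(columns={'pos': 'tweet_pos', 'neg': 'tweet_neg', 'neu': 'tweet_neu'})
           .reset_index()
           .rename(columns={'Date Created': 'Date'})
)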
tweets_weight.plot(x='Date', y=['tweet_pos','tweet_neu','tweet_neg'],title='Tweets\' Sentiment')
<AxesSubplot: title={'center': "Tweets' Sentiment"}, xlabel='Date'>

Tweets were mostly neutral in tone. Unsurprisingly, the weekly shares of positive and negative tweets are negatively correlated: a week with more positive tweets tends to have fewer negative ones.

Google News:

df_news['VaderSent'] = df_news['Headline'].apply(vader)
df_news.head(10)
Date Headline VaderSent
0 2020-06-08 Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... 0.4939
1 2020-06-17 Here's the Defining Characteristic of Home Depot's Success 0.5719
2 2020-06-17 Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more 0.0000
3 2020-06-18 Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more 0.2500
4 2020-06-24 1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa -0.8860
5 2020-06-25 'It's dehumanizing': Home Depot employee felt colleague's racist ... -0.8126
6 2020-06-30 Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna 0.2023
7 2020-06-30 The Home Depot Announces Renewable Energy Goal and Pledges ... 0.2732
8 2020-07-01 Home Depot bans some rope sales after nooses were found tied on ... 0.0000
9 2020-07-02 Home Depot changes rope sales practice after nooses are found in store 0.0000
df_news['Date'] = pd.to_datetime(df_news['Date'])
df_news['sentiment'] = df_news['VaderSent'].apply(bins)
df_news.head(10)
Date Headline VaderSent sentiment
0 2020-06-08 Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... 0.4939 pos
1 2020-06-17 Here's the Defining Characteristic of Home Depot's Success 0.5719 pos
2 2020-06-17 Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more 0.0000 neu
3 2020-06-18 Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more 0.2500 neu
4 2020-06-24 1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa -0.8860 neg
5 2020-06-25 'It's dehumanizing': Home Depot employee felt colleague's racist ... -0.8126 neg
6 2020-06-30 Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna 0.2023 neu
7 2020-06-30 The Home Depot Announces Renewable Energy Goal and Pledges ... 0.2732 neu
8 2020-07-01 Home Depot bans some rope sales after nooses were found tied on ... 0.0000 neu
9 2020-07-02 Home Depot changes rope sales practice after nooses are found in store 0.0000 neu
news_weight = pd.DataFrame(df_news.groupby([pd.Grouper(key='Date', freq='W')])['sentiment'], columns=['Date','Data'])
for i in range(len(news_weight)):
    n = dict(news_weight.loc[i,'Data'].value_counts(normalize=True))
    if 'neg' not in n:
        n['neg']=0
    if 'pos' not in n:
        n['pos']=0
    if 'neu' not in n:
        n['neu']=0
    news_weight.loc[i,'news_pos'] = n['pos']
    news_weight.loc[i,'news_neg'] = n['neg']
    news_weight.loc[i,'news_neu'] = n['neu']
news_weight
Date Data news_pos news_neg news_neu
0 2020-06-14 0 pos Name: sentiment, dtype: object 1.000000 0.000000 0.000000
1 2020-06-21 1 pos 2 neu 3 neu Name: sentiment, dtype: object 0.333333 0.000000 0.666667
2 2020-06-28 4 neg 5 neg Name: sentiment, dtype: object 0.000000 1.000000 0.000000
3 2020-07-05 6 neu 7 neu 8 neu 9 neu 10 neu 11 neg Name: sentiment, dtype: object 0.000000 0.166667 0.833333
4 2020-07-12 12 neu 13 neu Name: sentiment, dtype: object 0.000000 0.000000 1.000000
... ... ... ... ... ...
152 2023-05-14 1546 pos 1547 neu 1548 neu 1549 pos 1550 pos 1551 neu 1552 neu 1562 neu 1570 neu 1571 neg 1577 neu 1578 neu 1579 neu 1581 neu 1584 neg 1586 neu Name: sentiment, dtype: object 0.187500 0.125000 0.687500
153 2023-05-21 1587 neg 1589 neu 1590 neu 1592 neu 1593 neg 1594 neg 1595 neu 1596 neg 1597 neu 1598 neu 1599 neu 1600 neu 1601 pos 1602 neu 1603 neu 1604 neu 1605 neu 1606 neu 1608 neu 1609 neg 1610 neu 1611 neg 1614 neg 1615 pos 1616 neu 1619 neu 1620 neu 1621 neu 1623 pos 1631 neu 1632 neu 1633 neu Name: sentiment, dtype: object 0.093750 0.218750 0.687500
154 2023-05-28 1634 neu 1638 pos 1640 pos 1641 neu 1642 neu 1643 neg 1646 neu 1647 neu 1648 neu 1649 neg 1650 neu 1658 neu 1659 neg 1666 neu 1667 neu 1668 neu 1669 neu 1670 neu 1671 neg 1672 neu 1673 neg 1674 neg 1676 pos Name: sentiment, dtype: object 0.130435 0.260870 0.608696
155 2023-06-04 1677 pos 1679 neg 1680 neu 1682 neu 1683 neg 1685 neu 1692 neu 1694 neg 1696 neg 1698 pos 1699 pos 1700 pos 1704 pos 1705 neu 1706 neg 1708 neu 1710 pos 1711 neu 1713 neu 1716 neu 1718 neu 1719 neg 1721 neu 1722 neu 1724 neu Name: sentiment, dtype: object 0.240000 0.240000 0.520000
156 2023-06-11 1725 pos 1726 neu 1727 neu 1728 neu 1731 neg 1732 neu 1733 neg 1734 neu 1735 neu 1736 neg 1737 neg Name: sentiment, dtype: object 0.090909 0.363636 0.545455

157 rows × 5 columns

news_weight.drop('Data', inplace=True, axis=1)
news_weight
Date news_pos news_neg news_neu
0 2020-06-14 1.000000 0.000000 0.000000
1 2020-06-21 0.333333 0.000000 0.666667
2 2020-06-28 0.000000 1.000000 0.000000
3 2020-07-05 0.000000 0.166667 0.833333
4 2020-07-12 0.000000 0.000000 1.000000
... ... ... ... ...
152 2023-05-14 0.187500 0.125000 0.687500
153 2023-05-21 0.093750 0.218750 0.687500
154 2023-05-28 0.130435 0.260870 0.608696
155 2023-06-04 0.240000 0.240000 0.520000
156 2023-06-11 0.090909 0.363636 0.545455

157 rows × 4 columns

news_weight.plot(x='Date', y=['news_pos','news_neg','news_neu'], title='News Sentiment')
<AxesSubplot: title={'center': 'News Sentiment'}, xlabel='Date'>

News headlines were also mostly neutral in tone. Again, the weekly shares of positive and negative headlines are negatively correlated.

df_combined.drop('ticker', inplace=True, axis=1)
df_combined.head()
open high low close adjclose volume TwoWeeks Month Home Depot
2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800.0 247.481999 248.1370 90.0
2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800.0 247.300999 247.6660 90.0
2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000.0 247.187000 247.2865 90.0
2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700.0 247.370000 246.9620 90.0
2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300.0 247.842999 247.4940 90.0
df_combined = df_combined.reset_index()
df_combined.rename(columns={'index':'Date'}, inplace=True)
df_combined.head()
Date open high low close adjclose volume TwoWeeks Month Home Depot
0 2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800.0 247.481999 248.1370 90.0
1 2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800.0 247.300999 247.6660 90.0
2 2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000.0 247.187000 247.2865 90.0
3 2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700.0 247.370000 246.9620 90.0
4 2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300.0 247.842999 247.4940 90.0
import numpy as np
df_weekly = pd.DataFrame(df_combined.groupby([pd.Grouper(key='Date', freq='W')]).agg(np.mean))
df_weekly.head()
open high low close adjclose volume TwoWeeks Month Home Depot
Date
2020-07-12 248.691998 250.581998 246.664001 248.828003 231.864401 2819120.0 247.436599 247.509100 90.0
2020-07-19 255.570004 259.695996 253.284000 256.733997 239.231400 3799580.0 250.709800 249.001500 86.0
2020-07-26 263.217999 265.070001 261.105994 263.376007 245.420612 2661120.0 256.946602 252.191601 85.0
2020-08-02 265.357996 267.876001 263.217999 265.832001 247.709158 2700400.0 263.277204 256.993502 85.0
2020-08-09 267.792004 270.044006 266.348004 268.508002 250.202713 2319420.0 266.128600 261.537601 80.0
df_final = df_weekly.merge(news_weight, on='Date').merge(tweets_weight, on='Date')
df_final.columns = [i.lower() for i in df_final.columns]
df_final.head()
date open high low close adjclose volume twoweeks month home depot news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 248.691998 250.581998 246.664001 248.828003 231.864401 2819120.0 247.436599 247.509100 90.0 0.000000 0.00 1.000000 0.000000 0.166667 0.833333
1 2020-07-19 255.570004 259.695996 253.284000 256.733997 239.231400 3799580.0 250.709800 249.001500 86.0 0.000000 0.75 0.250000 0.000000 0.400000 0.600000
2 2020-07-26 263.217999 265.070001 261.105994 263.376007 245.420612 2661120.0 256.946602 252.191601 85.0 0.333333 0.00 0.666667 0.125000 0.125000 0.750000
3 2020-08-02 265.357996 267.876001 263.217999 265.832001 247.709158 2700400.0 263.277204 256.993502 85.0 0.200000 0.20 0.600000 0.181818 0.272727 0.545455
4 2020-08-09 267.792004 270.044006 266.348004 268.508002 250.202713 2319420.0 266.128600 261.537601 80.0 0.333333 0.00 0.666667 0.333333 0.222222 0.444444
df_final.rename(columns={"home depot":"trends"}, inplace=True)
df_final.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 248.691998 250.581998 246.664001 248.828003 231.864401 2819120.0 247.436599 247.509100 90.0 0.000000 0.00 1.000000 0.000000 0.166667 0.833333
1 2020-07-19 255.570004 259.695996 253.284000 256.733997 239.231400 3799580.0 250.709800 249.001500 86.0 0.000000 0.75 0.250000 0.000000 0.400000 0.600000
2 2020-07-26 263.217999 265.070001 261.105994 263.376007 245.420612 2661120.0 256.946602 252.191601 85.0 0.333333 0.00 0.666667 0.125000 0.125000 0.750000
3 2020-08-02 265.357996 267.876001 263.217999 265.832001 247.709158 2700400.0 263.277204 256.993502 85.0 0.200000 0.20 0.600000 0.181818 0.272727 0.545455
4 2020-08-09 267.792004 270.044006 266.348004 268.508002 250.202713 2319420.0 266.128600 261.537601 80.0 0.333333 0.00 0.666667 0.333333 0.222222 0.444444
#Scaling the data
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()
df_final_scaled = pd.DataFrame(SS.fit_transform(df_final.iloc[:,1:]), columns=df_final.columns[1:])
df_final_scaled.insert(0, "date", df_final['date'])
df_final_scaled.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 -1.738792 -1.736071 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 2020-07-19 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 -1.644438 -1.692888 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 2020-07-26 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 -1.464655 -1.600581 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 2020-08-02 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 -1.282168 -1.461636 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 2020-08-09 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 -1.199974 -1.330150 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
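The modeling sections below read the scaled frame back from 'df_final_home_depot_scaled.csv', so it is presumably written to disk at this point in the original run; a sketch of that step:

# Persist the scaled weekly dataset for the regression and CNN+LSTM sections below
# (file name taken from the read_csv calls later in the notebook)
df_final_scaled.to_csv('df_final_home_depot_scaled.csv')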
df_final_scaled.corr(numeric_only=True).round(3)
open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
open 1.000 0.999 0.999 0.998 0.986 -0.051 0.983 0.954 -0.221 0.241 -0.043 -0.148 0.009 -0.075 0.058
high 0.999 1.000 0.998 0.999 0.988 -0.040 0.982 0.954 -0.227 0.246 -0.038 -0.156 0.010 -0.074 0.056
low 0.999 0.998 1.000 0.999 0.985 -0.075 0.978 0.947 -0.210 0.238 -0.045 -0.145 0.008 -0.079 0.062
close 0.998 0.999 0.999 1.000 0.988 -0.059 0.978 0.949 -0.218 0.245 -0.041 -0.152 0.009 -0.077 0.059
adjclose 0.986 0.988 0.985 0.988 1.000 -0.042 0.970 0.945 -0.321 0.267 -0.013 -0.192 0.011 -0.072 0.053
volume -0.051 -0.040 -0.075 -0.059 -0.042 1.000 0.018 0.058 -0.057 0.048 -0.005 -0.032 0.014 0.090 -0.092
twoweeks 0.983 0.982 0.978 0.978 0.970 0.018 1.000 0.986 -0.266 0.248 -0.035 -0.160 0.006 -0.077 0.062
month 0.954 0.954 0.947 0.949 0.945 0.058 0.986 1.000 -0.311 0.229 -0.034 -0.146 -0.003 -0.074 0.068
trends -0.221 -0.227 -0.210 -0.218 -0.321 -0.057 -0.266 -0.311 1.000 -0.227 -0.072 0.227 -0.015 0.006 0.008
news_pos 0.241 0.246 0.238 0.245 0.267 0.048 0.248 0.229 -0.227 1.000 -0.151 -0.637 -0.074 0.002 0.066
news_neg -0.043 -0.038 -0.045 -0.041 -0.013 -0.005 -0.035 -0.034 -0.072 -0.151 1.000 -0.666 -0.004 -0.005 0.008
news_neu -0.148 -0.156 -0.145 -0.152 -0.192 -0.032 -0.160 -0.146 0.227 -0.637 -0.666 1.000 0.059 0.003 -0.056
tweet_pos 0.009 0.010 0.008 0.009 0.011 0.014 0.006 -0.003 -0.015 -0.074 -0.004 0.059 1.000 -0.378 -0.581
tweet_neg -0.075 -0.074 -0.079 -0.077 -0.072 0.090 -0.077 -0.074 0.006 0.002 -0.005 0.003 -0.378 1.000 -0.534
tweet_neu 0.058 0.056 0.062 0.059 0.053 -0.092 0.062 0.068 0.008 0.066 0.008 -0.056 -0.581 -0.534 1.000

There is a modest positive correlation (about 0.25) between the share of positive news headlines and the two-week price average. We don't see a comparable correlation for positive tweets or any other sentiment column. Additionally, there is a negative correlation (about -0.27) between Google Trends interest and the two-week average: when people start searching for Home Depot on Google, the stock price tends to drift lower over the following two weeks.
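
The seaborn import at the top of the notebook can be put to use here; a heatmap of the same correlation matrix makes these weak but nonzero relationships easier to scan (a sketch):

import seaborn as sns

# Heatmap of the correlation matrix; the sentiment and trends columns sit in the lower-right block
corr = df_final_scaled.drop(columns=['date']).corr()
plt.figure(figsize=(12, 9))
sns.heatmap(corr.round(2), annot=True, cmap='coolwarm', center=0)
plt.title('Correlations between weekly price, sentiment and trends features')
plt.tight_layout()
plt.show()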

cols=['news_pos', 'news_neg','news_neu', 'tweet_pos', 'tweet_neg','tweet_neu', 'trends']
fig, ax = plt.subplots(3, 3, figsize=(15, 12))
ax = ax.ravel()
for i in range(len(cols)):
    df_final_scaled.plot(x='date', y=['close','twoweeks','month', cols[i]], ax=ax[i], title=cols[i])
#Hide the two unused subplots
ax[7].axis('off')
ax[8].axis('off')
plt.tight_layout()

We can observe the correlations mentioned above in the plots as well. The first plot, representing positive news, shows that the share of positive headlines loosely tracks the stock price. On the other hand, the last plot, depicting trends, shows the negative relationship between Google searches and the stock price. Furthermore, the number of Google searches for "Home Depot" has steadily declined over time.

Applying Linear Regression on Historical Price Data:

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
plt.style.use("ggplot")
df_final_scaled = pd.read_csv('df_final_home_depot_scaled.csv')
df_final_scaled.drop('Unnamed: 0', inplace=True, axis=1)
df_final_scaled.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 -1.738792 -1.736071 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 2020-07-19 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 -1.644438 -1.692888 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 2020-07-26 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 -1.464655 -1.600581 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 2020-08-02 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 -1.282168 -1.461636 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 2020-08-09 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 -1.199974 -1.330150 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
x= df_final_scaled[['open','high','low','close', 'adjclose','volume','news_pos', 'news_neg', 'news_neu', 'tweet_pos',
       'tweet_neg', 'tweet_neu', 'trends']]
y=df_final_scaled['twoweeks']
import statsmodels.api as sm
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               twoweeks   R-squared:                       0.977
Model:                            OLS   Adj. R-squared:                  0.975
Method:                 Least Squares   F-statistic:                     537.8
Date:                Mon, 12 Jun 2023   Prob (F-statistic):          3.14e-109
Time:                        02:36:26   Log-Likelihood:                 70.561
No. Observations:                 153   AIC:                            -117.1
Df Residuals:                     141   BIC:                            -80.76
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.501e-16      0.013  -1.17e-14      1.000      -0.025       0.025
open           2.3468      0.511      4.594      0.000       1.337       3.357
high          -0.2428      0.541     -0.448      0.655      -1.313       0.828
low           -0.7347      0.559     -1.314      0.191      -1.840       0.371
close         -0.2749      0.545     -0.504      0.615      -1.352       0.802
adjclose      -0.1252      0.118     -1.064      0.289      -0.358       0.107
volume         0.0494      0.017      2.981      0.003       0.017       0.082
news_pos       0.0014      0.010      0.141      0.888      -0.019       0.022
news_neg       0.0043      0.010      0.438      0.662      -0.015       0.023
news_neu      -0.0044      0.007     -0.594      0.554      -0.019       0.010
tweet_pos     -0.0020      0.009     -0.229      0.819      -0.019       0.015
tweet_neg     -0.0071      0.009     -0.778      0.438      -0.025       0.011
tweet_neu      0.0081      0.008      1.009      0.315      -0.008       0.024
trends        -0.0524      0.019     -2.785      0.006      -0.090      -0.015
==============================================================================
Omnibus:                        1.461   Durbin-Watson:                   1.694
Prob(Omnibus):                  0.482   Jarque-Bera (JB):                1.534
Skew:                           0.193   Prob(JB):                        0.464
Kurtosis:                       2.697   Cond. No.                     5.93e+15
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.26e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
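One way to quantify the multicollinearity flagged in note [2] is the variance inflation factor; a sketch using the design matrix x built above (the near-duplicate price columns should show very large values):

from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF for each regressor; column 0 is the constant and is skipped
vif = pd.Series(
    [variance_inflation_factor(x.values, i) for i in range(1, x.shape[1])],
    index=x.columns[1:],
)
print(vif.round(1))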
x= df_final_scaled[['open','high','low','close', 'adjclose','volume']]
y=df_final_scaled['twoweeks']
from sklearn.model_selection import train_test_split 
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3 , shuffle=False,random_state = 0)
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(train_x, train_y)
print("regression coefficient",regression.coef_)
print("regression intercept",regression.intercept_)
regression coefficient [ 2.15513818  0.1568129  -0.63183193 -0.94351993  0.2494587   0.05989017]
regression intercept 0.017827179785784394
regression_confidence = regression.score(test_x, test_y)
print("linear regression confidence: ", regression_confidence)
linear regression confidence:  0.8354280496114086
predicted=regression.predict(test_x)
print(test_x.head())
         open      high       low     close  adjclose    volume
107 -0.243796 -0.255966 -0.239776 -0.208430 -0.104242 -0.699127
108 -0.115385 -0.079953 -0.082182 -0.058794  0.042456 -0.785429
109  0.099918  0.070874  0.110736  0.082941  0.181410 -0.899154
110  0.366759  0.437578  0.380624  0.428573  0.520257  0.542134
111  0.080282  0.039007  0.007433 -0.028003  0.072644 -0.737113
dfr=pd.DataFrame({'Actual_Price':test_y, 'Predicted_Price':predicted})
dfr.head(10)
Actual_Price Predicted_Price
107 -0.251375 -0.267445
108 -0.142925 -0.172431
109 -0.051125 0.087459
110 0.166720 0.394254
111 0.250333 0.192664
112 -0.043019 -0.309534
113 -0.333535 -0.397486
114 -0.476311 -0.423891
115 -0.740981 -0.954391
116 -0.992016 -0.840699
dfr.describe()
Actual_Price Predicted_Price
count 46.000000 46.000000
mean -0.191756 -0.132461
std 0.438800 0.433547
min -0.992016 -0.954391
25% -0.502699 -0.417290
50% -0.331794 -0.271980
75% 0.274308 0.247046
max 0.478988 0.858482
from sklearn import metrics
import numpy as np
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(test_y, predicted))
print('Mean Squared Error (MSE) :', metrics.mean_squared_error(test_y, predicted))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(test_y, predicted)))
Mean Absolute Error (MAE): 0.14249150773348154
Mean Squared Error (MSE) : 0.030998755667901733
Root Mean Squared Error (RMSE): 0.17606463491542454
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price,  color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()
plt.plot(dfr.Actual_Price, color='black', label='Actual')
plt.plot(dfr.Predicted_Price, color='blue',label='Predicted')
plt.title("Prediction using only Historical Data")
plt.legend()
<matplotlib.legend.Legend at 0x7fc3b62ce770>

Applying Linear Regression on Both Historical and Newly Added Features:

df_final_scaled.columns
Index(['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume',
       'twoweeks', 'month', 'trends', 'news_pos', 'news_neg', 'news_neu',
       'tweet_pos', 'tweet_neg', 'tweet_neu'],
      dtype='object')
x= df_final_scaled[['open','high','low','close', 'adjclose','volume','news_pos', 'news_neg', 'news_neu', 'tweet_pos',
       'tweet_neg', 'tweet_neu', 'trends']]

y=df_final_scaled['twoweeks']
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3 , shuffle=False,random_state = 0)
regression = LinearRegression()
regression.fit(train_x, train_y)
print("regression coefficient",regression.coef_)
print("regression intercept",regression.intercept_)
regression coefficient [ 1.91871810e+00  1.89637622e-01 -3.49725608e-01 -8.35917172e-01
  4.63540880e-02  6.48403899e-02  1.45467759e-03  5.40953328e-03
 -5.31782650e-03 -3.13379351e-03 -7.57685954e-03  9.52332816e-03
 -5.55338621e-02]
regression intercept 0.023765631258188702
regression_confidence = regression.score(test_x, test_y)
print("linear regression confidence: ", regression_confidence)
linear regression confidence:  0.8294859517339385
predicted=regression.predict(test_x)
print(test_x.head())
         open      high       low     close  adjclose    volume  news_pos  \
107 -0.243796 -0.255966 -0.239776 -0.208430 -0.104242 -0.699127  0.332721   
108 -0.115385 -0.079953 -0.082182 -0.058794  0.042456 -0.785429 -1.106820   
109  0.099918  0.070874  0.110736  0.082941  0.181410 -0.899154  0.332721   
110  0.366759  0.437578  0.380624  0.428573  0.520257  0.542134  0.512664   
111  0.080282  0.039007  0.007433 -0.028003  0.072644 -0.737113  0.512664   

     news_neg  news_neu  tweet_pos  tweet_neg  tweet_neu    trends  
107  0.894529 -0.948910   0.533344   1.071344  -1.428981 -0.475612  
108  0.894529  0.137234   0.824251  -1.044107   0.165137 -0.380115  
109  0.894529 -0.948910  -1.503002   0.567665   0.873634 -0.666606  
110 -0.802239  0.239060   0.471637  -0.018434  -0.414542 -0.666606  
111  0.372447 -0.677374   1.890908  -0.338957  -1.428981 -0.475612  
from sklearn.metrics import r2_score
r2_score(test_y, predicted)   #y_true first, then predictions
0.8294859517339385
dfr=pd.DataFrame({'Actual_Price':test_y, 'Predicted_Price':predicted})
dfr.head(10)
Actual_Price Predicted_Price
107 -0.251375 -0.271245
108 -0.142925 -0.153351
109 -0.051125 0.127086
110 0.166720 0.405224
111 0.250333 0.177392
112 -0.043019 -0.300313
113 -0.333535 -0.381184
114 -0.476311 -0.385414
115 -0.740981 -0.939092
116 -0.992016 -0.805985
dfr.describe()
Actual_Price Predicted_Price
count 46.000000 46.000000
mean -0.191756 -0.112709
std 0.438800 0.438281
min -0.992016 -0.939092
25% -0.502699 -0.419594
50% -0.331794 -0.269632
75% 0.274308 0.268095
max 0.478988 0.895990
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(test_y, predicted))
print('Mean Squared Error (MSE) :', metrics.mean_squared_error(test_y, predicted))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(test_y, predicted)))
Mean Absolute Error (MAE): 0.14758907019543632
Mean Squared Error (MSE) : 0.0321180086136408
Root Mean Squared Error (RMSE): 0.17921497876472492
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price,  color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()
plt.plot(dfr.Actual_Price, color='black', label='Actual')
plt.plot(dfr.Predicted_Price, color='blue',label='Predicted')
plt.title("Predictions using  Historical, Sentiment and Trends")
plt.legend()
<matplotlib.legend.Legend at 0x7fc3b6000850>

Applying CNN+LSTM on Historical Data:

import pandas as pd
df = pd.read_csv('df_final_home_depot_scaled.csv')
df.drop('Unnamed: 0', inplace=True, axis=1)
df.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 -1.738792 -1.736071 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 2020-07-19 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 -1.644438 -1.692888 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 2020-07-26 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 -1.464655 -1.600581 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 2020-08-02 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 -1.282168 -1.461636 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 2020-08-09 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 -1.199974 -1.330150 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
df.columns
Index(['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume',
       'twoweeks', 'month', 'trends', 'news_pos', 'news_neg', 'news_neu',
       'tweet_pos', 'tweet_neg', 'tweet_neu'],
      dtype='object')
df_hist = df[['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume',
       'twoweeks', 'month']]
df_hist.shape
(153, 9)
#Build the feature matrix X from each row (open, high, low, close, adjclose, volume)
X = [[df_hist.iloc[j,i+1] for i in range(df_hist.shape[1]-3)] for j in range(df_hist.shape[0])]
#Append all "twoweeks" values
Y = [df_hist.iloc[i,7] for i in range(df_hist.shape[0])]
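The two list comprehensions above can be expressed more directly with pandas indexing; a sketch of the equivalent arrays:

# Equivalent construction of the same feature matrix and target vector
X_arr = df_hist[['open', 'high', 'low', 'close', 'adjclose', 'volume']].to_numpy()
Y_arr = df_hist['twoweeks'].to_numpy()
assert X_arr.shape == (len(X), 6) and Y_arr.shape == (len(Y),)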
import numpy as np
from sklearn.model_selection import train_test_split 


x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)

train_X = np.array(x_train)
test_X = np.array(x_test)
train_Y = np.array(y_train)
test_Y = np.array(y_test)
train_X = train_X.reshape(train_X.shape[0],1,6,1)
test_X = test_X.reshape(test_X.shape[0],1,6,1)
print(len(train_X))
print(len(test_X))
107
46
train_X[0]
array([[[-1.09683374],
        [-1.10216509],
        [-1.12420655],
        [-1.11575665],
        [-1.28069411],
        [-0.48313096]]])
from tensorflow.keras import backend as K
K.image_data_format()=="channels_first"
False
# For creating model and training
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
from tensorflow.keras.layers import MaxPooling1D, Flatten
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.metrics import RootMeanSquaredError

model = tf.keras.Sequential()

# Creating the Neural Network model here...
# CNN layers
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu', input_shape=(None, 6, 1))))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(128, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Flatten()))
# model.add(Dense(5, kernel_regularizer=L2(0.01)))

# LSTM layers
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dropout(0.5))

#Final layers
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])

history = model.fit(train_X, train_Y, validation_data=(test_X,test_Y), epochs=40,batch_size=40, verbose=1, shuffle =True)
Epoch 1/40
3/3 [==============================] - 13s 910ms/step - loss: 0.8740 - mse: 0.8740 - mae: 0.7390 - val_loss: 1.1876 - val_mse: 1.1876 - val_mae: 0.8232
Epoch 2/40
3/3 [==============================] - 0s 34ms/step - loss: 0.8003 - mse: 0.8003 - mae: 0.7047 - val_loss: 1.0375 - val_mse: 1.0375 - val_mae: 0.7627
Epoch 3/40
3/3 [==============================] - 0s 38ms/step - loss: 0.6692 - mse: 0.6692 - mae: 0.6315 - val_loss: 0.7258 - val_mse: 0.7258 - val_mae: 0.6202
Epoch 4/40
3/3 [==============================] - 0s 42ms/step - loss: 0.3853 - mse: 0.3853 - mae: 0.4627 - val_loss: 0.2971 - val_mse: 0.2971 - val_mae: 0.3433
Epoch 5/40
3/3 [==============================] - 0s 41ms/step - loss: 0.1832 - mse: 0.1832 - mae: 0.2912 - val_loss: 0.1127 - val_mse: 0.1127 - val_mae: 0.2656
Epoch 6/40
3/3 [==============================] - 0s 46ms/step - loss: 0.1054 - mse: 0.1054 - mae: 0.2555 - val_loss: 0.2310 - val_mse: 0.2310 - val_mae: 0.3723
Epoch 7/40
3/3 [==============================] - 0s 67ms/step - loss: 0.1590 - mse: 0.1590 - mae: 0.3158 - val_loss: 0.1858 - val_mse: 0.1858 - val_mae: 0.3358
Epoch 8/40
3/3 [==============================] - 0s 60ms/step - loss: 0.1221 - mse: 0.1221 - mae: 0.2605 - val_loss: 0.0740 - val_mse: 0.0740 - val_mae: 0.2339
Epoch 9/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0769 - mse: 0.0769 - mae: 0.2229 - val_loss: 0.0902 - val_mse: 0.0902 - val_mae: 0.2187
Epoch 10/40
3/3 [==============================] - 0s 81ms/step - loss: 0.0740 - mse: 0.0740 - mae: 0.1926 - val_loss: 0.1062 - val_mse: 0.1062 - val_mae: 0.2100
Epoch 11/40
3/3 [==============================] - 0s 64ms/step - loss: 0.0807 - mse: 0.0807 - mae: 0.1986 - val_loss: 0.0893 - val_mse: 0.0893 - val_mae: 0.2025
Epoch 12/40
3/3 [==============================] - 0s 61ms/step - loss: 0.0891 - mse: 0.0891 - mae: 0.2086 - val_loss: 0.0575 - val_mse: 0.0575 - val_mae: 0.1793
Epoch 13/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0599 - mse: 0.0599 - mae: 0.1769 - val_loss: 0.0513 - val_mse: 0.0513 - val_mae: 0.1723
Epoch 14/40
3/3 [==============================] - 0s 65ms/step - loss: 0.0515 - mse: 0.0515 - mae: 0.1749 - val_loss: 0.0740 - val_mse: 0.0740 - val_mae: 0.2093
Epoch 15/40
3/3 [==============================] - 0s 57ms/step - loss: 0.0550 - mse: 0.0550 - mae: 0.1783 - val_loss: 0.0788 - val_mse: 0.0788 - val_mae: 0.2178
Epoch 16/40
3/3 [==============================] - 0s 56ms/step - loss: 0.0655 - mse: 0.0655 - mae: 0.1941 - val_loss: 0.0548 - val_mse: 0.0548 - val_mae: 0.1837
Epoch 17/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0431 - mse: 0.0431 - mae: 0.1603 - val_loss: 0.0412 - val_mse: 0.0412 - val_mae: 0.1610
Epoch 18/40
3/3 [==============================] - 0s 57ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1512 - val_loss: 0.0473 - val_mse: 0.0473 - val_mae: 0.1615
Epoch 19/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0426 - mse: 0.0426 - mae: 0.1599 - val_loss: 0.0502 - val_mse: 0.0502 - val_mae: 0.1643
Epoch 20/40
3/3 [==============================] - 0s 61ms/step - loss: 0.0579 - mse: 0.0579 - mae: 0.1759 - val_loss: 0.0423 - val_mse: 0.0423 - val_mae: 0.1570
Epoch 21/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0507 - mse: 0.0507 - mae: 0.1801 - val_loss: 0.0473 - val_mse: 0.0473 - val_mae: 0.1655
Epoch 22/40
3/3 [==============================] - 0s 71ms/step - loss: 0.0399 - mse: 0.0399 - mae: 0.1639 - val_loss: 0.0588 - val_mse: 0.0588 - val_mae: 0.1883
Epoch 23/40
3/3 [==============================] - 0s 36ms/step - loss: 0.0468 - mse: 0.0468 - mae: 0.1620 - val_loss: 0.0428 - val_mse: 0.0428 - val_mae: 0.1614
Epoch 24/40
3/3 [==============================] - 0s 35ms/step - loss: 0.0417 - mse: 0.0417 - mae: 0.1593 - val_loss: 0.0419 - val_mse: 0.0419 - val_mae: 0.1566
Epoch 25/40
3/3 [==============================] - 0s 40ms/step - loss: 0.0481 - mse: 0.0481 - mae: 0.1619 - val_loss: 0.0418 - val_mse: 0.0418 - val_mae: 0.1554
Epoch 26/40
3/3 [==============================] - 0s 40ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1476 - val_loss: 0.0420 - val_mse: 0.0420 - val_mae: 0.1560
Epoch 27/40
3/3 [==============================] - 0s 38ms/step - loss: 0.0471 - mse: 0.0471 - mae: 0.1628 - val_loss: 0.0429 - val_mse: 0.0429 - val_mae: 0.1625
Epoch 28/40
3/3 [==============================] - 0s 35ms/step - loss: 0.0461 - mse: 0.0461 - mae: 0.1648 - val_loss: 0.0434 - val_mse: 0.0434 - val_mae: 0.1660
Epoch 29/40
3/3 [==============================] - 0s 36ms/step - loss: 0.0360 - mse: 0.0360 - mae: 0.1533 - val_loss: 0.0427 - val_mse: 0.0427 - val_mae: 0.1616
Epoch 30/40
3/3 [==============================] - 0s 36ms/step - loss: 0.0376 - mse: 0.0376 - mae: 0.1511 - val_loss: 0.0431 - val_mse: 0.0431 - val_mae: 0.1578
Epoch 31/40
3/3 [==============================] - 0s 37ms/step - loss: 0.0435 - mse: 0.0435 - mae: 0.1703 - val_loss: 0.0448 - val_mse: 0.0448 - val_mae: 0.1599
Epoch 32/40
3/3 [==============================] - 0s 39ms/step - loss: 0.0414 - mse: 0.0414 - mae: 0.1560 - val_loss: 0.0459 - val_mse: 0.0459 - val_mae: 0.1602
Epoch 33/40
3/3 [==============================] - 0s 44ms/step - loss: 0.0437 - mse: 0.0437 - mae: 0.1640 - val_loss: 0.0477 - val_mse: 0.0477 - val_mae: 0.1672
Epoch 34/40
3/3 [==============================] - 0s 43ms/step - loss: 0.0421 - mse: 0.0421 - mae: 0.1515 - val_loss: 0.0508 - val_mse: 0.0508 - val_mae: 0.1743
Epoch 35/40
3/3 [==============================] - 0s 45ms/step - loss: 0.0427 - mse: 0.0427 - mae: 0.1623 - val_loss: 0.0461 - val_mse: 0.0461 - val_mae: 0.1658
Epoch 36/40
3/3 [==============================] - 0s 34ms/step - loss: 0.0444 - mse: 0.0444 - mae: 0.1668 - val_loss: 0.0426 - val_mse: 0.0426 - val_mae: 0.1588
Epoch 37/40
3/3 [==============================] - 0s 38ms/step - loss: 0.0468 - mse: 0.0468 - mae: 0.1661 - val_loss: 0.0496 - val_mse: 0.0496 - val_mae: 0.1661
Epoch 38/40
3/3 [==============================] - 0s 38ms/step - loss: 0.0431 - mse: 0.0431 - mae: 0.1544 - val_loss: 0.0435 - val_mse: 0.0435 - val_mae: 0.1588
Epoch 39/40
3/3 [==============================] - 0s 36ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1587 - val_loss: 0.0440 - val_mse: 0.0440 - val_mae: 0.1577
Epoch 40/40
3/3 [==============================] - 0s 34ms/step - loss: 0.0546 - mse: 0.0546 - mae: 0.1726 - val_loss: 0.0544 - val_mse: 0.0544 - val_mae: 0.1774
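The validation loss above levels off well before epoch 40. As a sketch of an alternative (not what was run here), an EarlyStopping callback could stop training once the validation loss stops improving and restore the best weights:

from tensorflow.keras.callbacks import EarlyStopping

# Stop when val_loss has not improved for 10 epochs and keep the best weights seen so far
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(train_X, train_Y, validation_data=(test_X, test_Y),
                    epochs=40, batch_size=40, shuffle=True, verbose=1,
                    callbacks=[early_stop])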
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7fc355192470>
plt.plot(history.history['mse'], label='train mse')
plt.plot(history.history['val_mse'], label='val mse')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7fc355252b30>
plt.plot(history.history['mae'], label='train mae')
plt.plot(history.history['val_mae'], label='val mae')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7fc355588a60>
model.evaluate(test_X, test_Y)
2/2 [==============================] - 0s 10ms/step - loss: 0.0544 - mse: 0.0544 - mae: 0.1774
[0.054426856338977814, 0.054426856338977814, 0.1773832142353058]
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import max_error

# model predictions for the test set (regression outputs, not class probabilities)
yhat_probs = model.predict(test_X, verbose=0)
# reduce to 1d array
yhat_probs = yhat_probs[:, 0]

var = explained_variance_score(test_Y.reshape(-1,1), yhat_probs)
print('Variance: %f' % var)

r2 = r2_score(test_Y.reshape(-1,1), yhat_probs)
print('R2 Score: %f' % r2)

var2 = max_error(test_Y.reshape(-1,1), yhat_probs)
print('Max Error: %f' % var2)
Variance: 0.962742
R2 Score: 0.962742
Max Error: 0.613890
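Since this evaluation block is repeated after every model, it could be factored into a small helper (a sketch; report_regression_metrics is a name introduced here, not used elsewhere in the notebook):

from sklearn.metrics import explained_variance_score, r2_score, max_error

def report_regression_metrics(model, test_X, test_Y):
    # Flatten the model output to 1-D and score it against the true targets
    preds = model.predict(test_X, verbose=0)[:, 0]
    print('Explained Variance: %f' % explained_variance_score(test_Y, preds))
    print('R2 Score: %f' % r2_score(test_Y, preds))
    print('Max Error: %f' % max_error(test_Y, preds))
    return preds

It would be called as preds = report_regression_metrics(model, test_X, test_Y) after each fit.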
predicted  = model.predict(test_X)
test_label = test_Y.reshape(-1,1)
predicted = np.array(predicted[:,0]).reshape(-1,1)
len_t = len(train_X)
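# Undo the feature scaling for plotting: each scaled value v is mapped back via v * temp + temp,
# where temp is a reference price taken from the original dataframe at the same row index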
for j in range(len_t , len_t + len(test_X)):
    temp = df_hist.iloc[j,3]
    test_label[j - len_t] = test_label[j - len_t] * temp + temp
    predicted[j - len_t] = predicted[j - len_t] * temp + temp
plt.plot(predicted, color='green', label='Predicted Stock Price')
plt.plot(test_label, color='red', label='Real Stock Price')
plt.title('Stock Price Prediction')
plt.xlabel('Time')
plt.ylabel('Stock Price')
plt.legend()
plt.show()
2/2 [==============================] - 0s 7ms/step

Applying CNN+LSTM on Historical+News+Tweets+Trends data:

df.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 -1.738792 -1.736071 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 2020-07-19 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 -1.644438 -1.692888 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 2020-07-26 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 -1.464655 -1.600581 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 2020-08-02 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 -1.282168 -1.461636 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 2020-08-09 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 -1.199974 -1.330150 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
df_x = df.drop(['date','twoweeks', 'month'], axis=1)
df_x.head()
open high low close adjclose volume trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
X = [[df_x.iloc[j, i] for i in range(df_x.shape[1])] for j in range(df_x.shape[0])]
Y = [df.iloc[i,7] for i in range(df.shape[0])] 
print(X[0:2])
print(Y[0:2])
[[-1.7038183261524975, -1.7302608232721683, -1.6829499939918715, -1.7057639670236395, -1.872820057485404, -0.8403922760048991, 2.580289089276951, -1.1068195873526132, -1.1938008487946932, 1.7664503220919647, -1.5030018624207688, -0.1039065146895656, 1.464048694140032], [-1.5057895100199, -1.470010607913253, -1.4909599654415322, -1.4778224513157197, -1.6589052190920777, -0.12416025602251, 2.198301444710552, -1.1068195873526132, 3.504942032500697, -1.8992863327148637, -1.5030018624207688, 1.2123738405060045, 0.3068368015117597]]
[-1.738791872212694, -1.6444381302921762]

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)

train_X = np.array(x_train)
test_X = np.array(x_test)
train_Y = np.array(y_train)
test_Y = np.array(y_test)

train_X = train_X.reshape(train_X.shape[0],1,13,1)
test_X = test_X.reshape(test_X.shape[0],1,13,1)
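# Shape (samples, time steps=1, features=13, channels=1): each sample is a single 13-feature row fed to TimeDistributed(Conv1D)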

print(len(train_X))
print(len(test_X))
107
46
# For creating model and training
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
from tensorflow.keras.layers import MaxPooling1D, Flatten
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.metrics import RootMeanSquaredError

model = tf.keras.Sequential()

# Creating the Neural Network model here...
# CNN layers
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu', input_shape=(None,13, 1))))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(128, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Flatten()))
#model.add(Dense(5, kernel_regularizer=L2(0.01)))

# LSTM layers
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dropout(0.5))

#Final layers
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])

history = model.fit(train_X, train_Y, validation_data=(test_X,test_Y), epochs=40,batch_size=40, verbose=1, shuffle =True)
Epoch 1/40
3/3 [==============================] - 14s 1s/step - loss: 1.1132 - mse: 1.1132 - mae: 0.7954 - val_loss: 0.6801 - val_mse: 0.6801 - val_mae: 0.7047
Epoch 2/40
3/3 [==============================] - 0s 47ms/step - loss: 1.0145 - mse: 1.0145 - mae: 0.7659 - val_loss: 0.5992 - val_mse: 0.5992 - val_mae: 0.6581
Epoch 3/40
3/3 [==============================] - 0s 49ms/step - loss: 0.8404 - mse: 0.8404 - mae: 0.7018 - val_loss: 0.3991 - val_mse: 0.3991 - val_mae: 0.5249
Epoch 4/40
3/3 [==============================] - 0s 49ms/step - loss: 0.4299 - mse: 0.4299 - mae: 0.5080 - val_loss: 0.0980 - val_mse: 0.0980 - val_mae: 0.2482
Epoch 5/40
3/3 [==============================] - 0s 49ms/step - loss: 0.1155 - mse: 0.1155 - mae: 0.2599 - val_loss: 0.3027 - val_mse: 0.3027 - val_mae: 0.4706
Epoch 6/40
3/3 [==============================] - 0s 46ms/step - loss: 0.2489 - mse: 0.2489 - mae: 0.4039 - val_loss: 0.1436 - val_mse: 0.1436 - val_mae: 0.3104
Epoch 7/40
3/3 [==============================] - 0s 51ms/step - loss: 0.0943 - mse: 0.0943 - mae: 0.2432 - val_loss: 0.0345 - val_mse: 0.0345 - val_mae: 0.1563
Epoch 8/40
3/3 [==============================] - 0s 54ms/step - loss: 0.0843 - mse: 0.0843 - mae: 0.2124 - val_loss: 0.0730 - val_mse: 0.0730 - val_mae: 0.2204
Epoch 9/40
3/3 [==============================] - 0s 43ms/step - loss: 0.1079 - mse: 0.1079 - mae: 0.2609 - val_loss: 0.0979 - val_mse: 0.0979 - val_mae: 0.2547
Epoch 10/40
3/3 [==============================] - 0s 50ms/step - loss: 0.0937 - mse: 0.0937 - mae: 0.2384 - val_loss: 0.0860 - val_mse: 0.0860 - val_mae: 0.2278
Epoch 11/40
3/3 [==============================] - 0s 51ms/step - loss: 0.0645 - mse: 0.0645 - mae: 0.2031 - val_loss: 0.0639 - val_mse: 0.0639 - val_mae: 0.1807
Epoch 12/40
3/3 [==============================] - 0s 50ms/step - loss: 0.0624 - mse: 0.0624 - mae: 0.1883 - val_loss: 0.0604 - val_mse: 0.0604 - val_mae: 0.1955
Epoch 13/40
3/3 [==============================] - 0s 51ms/step - loss: 0.0650 - mse: 0.0650 - mae: 0.1957 - val_loss: 0.0530 - val_mse: 0.0530 - val_mae: 0.1897
Epoch 14/40
3/3 [==============================] - 0s 58ms/step - loss: 0.0640 - mse: 0.0640 - mae: 0.1997 - val_loss: 0.0385 - val_mse: 0.0385 - val_mae: 0.1571
Epoch 15/40
3/3 [==============================] - 0s 47ms/step - loss: 0.0470 - mse: 0.0470 - mae: 0.1674 - val_loss: 0.0439 - val_mse: 0.0439 - val_mae: 0.1619
Epoch 16/40
3/3 [==============================] - 0s 49ms/step - loss: 0.0499 - mse: 0.0499 - mae: 0.1750 - val_loss: 0.0491 - val_mse: 0.0491 - val_mae: 0.1778
Epoch 17/40
3/3 [==============================] - 0s 52ms/step - loss: 0.0638 - mse: 0.0638 - mae: 0.1926 - val_loss: 0.0398 - val_mse: 0.0398 - val_mae: 0.1613
Epoch 18/40
3/3 [==============================] - 0s 53ms/step - loss: 0.0652 - mse: 0.0652 - mae: 0.1923 - val_loss: 0.0322 - val_mse: 0.0322 - val_mae: 0.1411
Epoch 19/40
3/3 [==============================] - 0s 57ms/step - loss: 0.0420 - mse: 0.0420 - mae: 0.1581 - val_loss: 0.0354 - val_mse: 0.0354 - val_mae: 0.1437
Epoch 20/40
3/3 [==============================] - 0s 53ms/step - loss: 0.0519 - mse: 0.0519 - mae: 0.1740 - val_loss: 0.0377 - val_mse: 0.0377 - val_mae: 0.1469
Epoch 21/40
3/3 [==============================] - 0s 57ms/step - loss: 0.0496 - mse: 0.0496 - mae: 0.1688 - val_loss: 0.0380 - val_mse: 0.0380 - val_mae: 0.1489
Epoch 22/40
3/3 [==============================] - 0s 49ms/step - loss: 0.0452 - mse: 0.0452 - mae: 0.1637 - val_loss: 0.0366 - val_mse: 0.0366 - val_mae: 0.1491
Epoch 23/40
3/3 [==============================] - 0s 49ms/step - loss: 0.0508 - mse: 0.0508 - mae: 0.1720 - val_loss: 0.0406 - val_mse: 0.0406 - val_mae: 0.1534
Epoch 24/40
3/3 [==============================] - 0s 49ms/step - loss: 0.0493 - mse: 0.0493 - mae: 0.1681 - val_loss: 0.0477 - val_mse: 0.0477 - val_mae: 0.1597
Epoch 25/40
3/3 [==============================] - 0s 49ms/step - loss: 0.0461 - mse: 0.0461 - mae: 0.1680 - val_loss: 0.0416 - val_mse: 0.0416 - val_mae: 0.1569
Epoch 26/40
3/3 [==============================] - 0s 45ms/step - loss: 0.0503 - mse: 0.0503 - mae: 0.1728 - val_loss: 0.0438 - val_mse: 0.0438 - val_mae: 0.1581
Epoch 27/40
3/3 [==============================] - 0s 53ms/step - loss: 0.0651 - mse: 0.0651 - mae: 0.1787 - val_loss: 0.0394 - val_mse: 0.0394 - val_mae: 0.1576
Epoch 28/40
3/3 [==============================] - 0s 50ms/step - loss: 0.0557 - mse: 0.0557 - mae: 0.1833 - val_loss: 0.0397 - val_mse: 0.0397 - val_mae: 0.1562
Epoch 29/40
3/3 [==============================] - 0s 63ms/step - loss: 0.0469 - mse: 0.0469 - mae: 0.1637 - val_loss: 0.0443 - val_mse: 0.0443 - val_mae: 0.1593
Epoch 30/40
3/3 [==============================] - 0s 59ms/step - loss: 0.0419 - mse: 0.0419 - mae: 0.1584 - val_loss: 0.0377 - val_mse: 0.0377 - val_mae: 0.1574
Epoch 31/40
3/3 [==============================] - 0s 54ms/step - loss: 0.0513 - mse: 0.0513 - mae: 0.1747 - val_loss: 0.0392 - val_mse: 0.0392 - val_mae: 0.1544
Epoch 32/40
3/3 [==============================] - 0s 54ms/step - loss: 0.0414 - mse: 0.0414 - mae: 0.1523 - val_loss: 0.0548 - val_mse: 0.0548 - val_mae: 0.1641
Epoch 33/40
3/3 [==============================] - 0s 53ms/step - loss: 0.0598 - mse: 0.0598 - mae: 0.1620 - val_loss: 0.0430 - val_mse: 0.0430 - val_mae: 0.1545
Epoch 34/40
3/3 [==============================] - 0s 55ms/step - loss: 0.0423 - mse: 0.0423 - mae: 0.1586 - val_loss: 0.0377 - val_mse: 0.0377 - val_mae: 0.1541
Epoch 35/40
3/3 [==============================] - 0s 54ms/step - loss: 0.0558 - mse: 0.0558 - mae: 0.1727 - val_loss: 0.0396 - val_mse: 0.0396 - val_mae: 0.1559
Epoch 36/40
3/3 [==============================] - 0s 61ms/step - loss: 0.0518 - mse: 0.0518 - mae: 0.1639 - val_loss: 0.0411 - val_mse: 0.0411 - val_mae: 0.1554
Epoch 37/40
3/3 [==============================] - 0s 51ms/step - loss: 0.0511 - mse: 0.0511 - mae: 0.1643 - val_loss: 0.0382 - val_mse: 0.0382 - val_mae: 0.1548
Epoch 38/40
3/3 [==============================] - 0s 52ms/step - loss: 0.0453 - mse: 0.0453 - mae: 0.1530 - val_loss: 0.0372 - val_mse: 0.0372 - val_mae: 0.1545
Epoch 39/40
3/3 [==============================] - 0s 55ms/step - loss: 0.0413 - mse: 0.0413 - mae: 0.1563 - val_loss: 0.0433 - val_mse: 0.0433 - val_mae: 0.1608
Epoch 40/40
3/3 [==============================] - 0s 56ms/step - loss: 0.0496 - mse: 0.0496 - mae: 0.1716 - val_loss: 0.0464 - val_mse: 0.0464 - val_mae: 0.1663
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7fc354e366b0>
model.evaluate(test_X, test_Y)
2/2 [==============================] - 0s 11ms/step - loss: 0.0464 - mse: 0.0464 - mae: 0.1663
[0.046359434723854065, 0.046359434723854065, 0.1662808209657669]
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import max_error

# model predictions for the test set (regression outputs, not class probabilities)
yhat_probs = model.predict(test_X, verbose=0)
# reduce to 1d array
yhat_probs = yhat_probs[:, 0]

var = explained_variance_score(test_Y.reshape(-1,1), yhat_probs)
print('Variance: %f' % var)

r2 = r2_score(test_Y.reshape(-1,1), yhat_probs)
print('R2 Score: %f' % r2)

var2 = max_error(test_Y.reshape(-1,1), yhat_probs)
print('Max Error: %f' % var2)
Variance: 0.938245
R2 Score: 0.938245
Max Error: 0.774743
predicted  = model.predict(test_X)

test_label = test_Y.reshape(-1,1)
predicted = np.array(predicted[:,0]).reshape(-1,1)
len_t = len(train_X)
for j in range(len_t , len_t + len(test_X)):
    temp = df.iloc[j,3]
    test_label[j - len_t] = test_label[j - len_t] * temp + temp
    predicted[j - len_t] = predicted[j - len_t] * temp + temp
plt.plot(predicted, color='green', label='Predicted Stock Price')
plt.plot(test_label, color='red', label='Real Stock Price')
plt.title('Stock Price Prediction')
plt.xlabel('Time')
plt.ylabel('Stock Price')
plt.legend()
plt.show()
2/2 [==============================] - 0s 18ms/step

Clustering using K-Means

I will cluster the data points with K-Means, append the cluster labels to the original data, and rerun Linear Regression and CNN+LSTM to see whether this extra feature improves their performance. Before clustering, I reduce the dimensionality of the data with PCA.
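As a compact alternative to running PCA and K-Means as separate steps (a sketch using the df_values feature frame constructed a few cells below, with the same settings; not the code that produced the results here), the two can be chained in a scikit-learn Pipeline:

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

pca_kmeans = Pipeline([
    ('pca', PCA(n_components=3)),                 # project the scaled features onto 3 components
    ('kmeans', KMeans(n_clusters=5, n_init=10)),  # then cluster in the reduced space
])
clusters = pca_kmeans.fit_predict(df_values)      # one cluster label per row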

df.corr().round(3)
<ipython-input-87-8faa7a8256e3>:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.corr().round(3)
open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
open 1.000 0.999 0.999 0.998 0.986 -0.051 0.983 0.954 -0.221 0.241 -0.043 -0.148 0.009 -0.075 0.058
high 0.999 1.000 0.998 0.999 0.988 -0.040 0.982 0.954 -0.227 0.246 -0.038 -0.156 0.010 -0.074 0.056
low 0.999 0.998 1.000 0.999 0.985 -0.075 0.978 0.947 -0.210 0.238 -0.045 -0.145 0.008 -0.079 0.062
close 0.998 0.999 0.999 1.000 0.988 -0.059 0.978 0.949 -0.218 0.245 -0.041 -0.152 0.009 -0.077 0.059
adjclose 0.986 0.988 0.985 0.988 1.000 -0.042 0.970 0.945 -0.321 0.267 -0.013 -0.192 0.011 -0.072 0.053
volume -0.051 -0.040 -0.075 -0.059 -0.042 1.000 0.018 0.058 -0.057 0.048 -0.005 -0.032 0.014 0.090 -0.092
twoweeks 0.983 0.982 0.978 0.978 0.970 0.018 1.000 0.986 -0.266 0.248 -0.035 -0.160 0.006 -0.077 0.062
month 0.954 0.954 0.947 0.949 0.945 0.058 0.986 1.000 -0.311 0.229 -0.034 -0.146 -0.003 -0.074 0.068
trends -0.221 -0.227 -0.210 -0.218 -0.321 -0.057 -0.266 -0.311 1.000 -0.227 -0.072 0.227 -0.015 0.006 0.008
news_pos 0.241 0.246 0.238 0.245 0.267 0.048 0.248 0.229 -0.227 1.000 -0.151 -0.637 -0.074 0.002 0.066
news_neg -0.043 -0.038 -0.045 -0.041 -0.013 -0.005 -0.035 -0.034 -0.072 -0.151 1.000 -0.666 -0.004 -0.005 0.008
news_neu -0.148 -0.156 -0.145 -0.152 -0.192 -0.032 -0.160 -0.146 0.227 -0.637 -0.666 1.000 0.059 0.003 -0.056
tweet_pos 0.009 0.010 0.008 0.009 0.011 0.014 0.006 -0.003 -0.015 -0.074 -0.004 0.059 1.000 -0.378 -0.581
tweet_neg -0.075 -0.074 -0.079 -0.077 -0.072 0.090 -0.077 -0.074 0.006 0.002 -0.005 0.003 -0.378 1.000 -0.534
tweet_neu 0.058 0.056 0.062 0.059 0.053 -0.092 0.062 0.068 0.008 0.066 0.008 -0.056 -0.581 -0.534 1.000
import pandas as pd
df_final = pd.read_csv('df_final_home_depot_scaled.csv')
df_final.head()
Unnamed: 0 date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 0 2020-07-12 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 -1.738792 -1.736071 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 1 2020-07-19 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 -1.644438 -1.692888 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 2 2020-07-26 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 -1.464655 -1.600581 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 3 2020-08-02 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 -1.282168 -1.461636 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 4 2020-08-09 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 -1.199974 -1.330150 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
df_final.columns
Index(['Unnamed: 0', 'date', 'open', 'high', 'low', 'close', 'adjclose',
       'volume', 'twoweeks', 'month', 'trends', 'news_pos', 'news_neg',
       'news_neu', 'tweet_pos', 'tweet_neg', 'tweet_neu'],
      dtype='object')
df_values = df_final[['open', 'high', 'low', 'close', 'adjclose',
       'volume', 'trends', 'news_pos', 'news_neg',
       'news_neu', 'tweet_pos', 'tweet_neg', 'tweet_neu']]
df_values.head()
open high low close adjclose volume trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
reduced_df = pd.DataFrame(pca.fit_transform(df_values), columns=(["c1","c2", "c3"]))
reduced_df.head()
c1 c2 c3
0 -4.406186 -1.618010 -2.783632
1 -3.608144 3.040917 -0.156988
2 -2.935629 0.057277 -1.898846
3 -2.966506 0.316674 -0.513015
4 -2.639892 -0.374056 -0.027751
import sklearn
from sklearn.cluster import KMeans
distances = []
k_values = range(2,10)
for cluster in k_values:
    kmeans = KMeans(n_clusters=cluster)
    kmeans.fit(reduced_df)
    distances.append(kmeans.inertia_)
plt.plot(k_values, distances,'bx-')
plt.xlabel('Number of clusters') 
plt.ylabel('Sum of Distances') 
plt.title('Elbow Method For Optimal Number of Clusters')
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
(warning repeated once per value of k)
from yellowbrick.cluster import KElbowVisualizer
auto_elbow = KElbowVisualizer(KMeans(), k=10)
auto_elbow.fit(df_values)
auto_elbow.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
(warning repeated once per value of k)
<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
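The repeated FutureWarning above comes from relying on the default n_init. Setting it explicitly (a one-line tweak, not applied in the runs above) keeps the current behaviour and silences the warning:

kmeans = KMeans(n_clusters=cluster, n_init=10)  # 10 is the current default; an explicit value suppresses the FutureWarning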
from sklearn.cluster import KMeans
km=KMeans(n_clusters=5)
clusters= km.fit_predict(reduced_df)
pca_clust = reduced_df.copy()
pca_clust['clusters']=clusters
pca_clust.head()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
c1 c2 c3 clusters
0 -4.406186 -1.618010 -2.783632 2
1 -3.608144 3.040917 -0.156988 4
2 -2.935629 0.057277 -1.898846 4
3 -2.966506 0.316674 -0.513015 4
4 -2.639892 -0.374056 -0.027751 2
df_values['cluster'] = clusters
df_values.head()
<ipython-input-95-7dac953dbd69>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_values['cluster'] = clusters
open high low close adjclose volume trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu cluster
0 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049 2
1 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837 4
2 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759 4
3 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320 4
4 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638 2
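The SettingWithCopyWarning above appears because df_values is a slice of df_final. Taking an explicit copy before adding the column (a small fix, not applied in the run above) avoids it:

df_values = df_values.copy()        # work on an independent copy of the selected columns
df_values['cluster'] = clusters     # assignment now targets the copy, so no SettingWithCopyWarning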

Applying K-Means and Linear Regression on Historical Data:

df_x = df_values[['open', 'high', 'low', 'close', 'adjclose','volume', 'cluster']]
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score
x_train, x_test, y_train, y_test = train_test_split(df_x, df_final['twoweeks'], test_size=0.3, random_state=0)

regression = LinearRegression()
regression.fit(x_train, y_train)
print("regression coefficient",regression.coef_)
print("regression intercept",regression.intercept_)
regression coefficient [ 2.80045428 -0.37223382 -1.53280191  0.06410424  0.02119985  0.03065633
 -0.00540246]
regression intercept -0.012097080760859524
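# Note: for a regressor, .score(x_test, y_test) returns R^2 (the coefficient of determination) on that split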
regression_confidence = regression.score(x_test, y_test)
print("linear regression confidence: ", regression_confidence)
linear regression confidence:  0.9476930471292958
predicted=regression.predict(x_test)
print(x_test.head())
         open      high       low     close  adjclose    volume  cluster
26  -1.222595 -1.220040 -1.189692 -1.194005 -1.314978  0.075531        4
135  0.423421  0.383837  0.416818  0.388613  0.595698 -0.751431        0
63   0.783028  0.758650  0.821157  0.777318  0.707787 -0.779556        1
105 -0.615611 -0.554651 -0.594871 -0.573033 -0.461688 -0.918600        4
24  -1.103859 -1.135303 -1.064246 -1.082658 -1.209359 -0.907303        2
dfr=pd.DataFrame({'Actual_Price':y_test, 'Predicted_Price':predicted}).reset_index()
dfr.drop(columns='index', inplace=True, axis=1)
dfr.head(10)
Actual_Price Predicted_Price
0 -1.159801 -1.281929
1 0.471989 0.406400
2 0.753759 0.675203
3 -0.716651 -0.714101
4 -1.131320 -1.183188
5 -0.671596 -0.561560
6 0.666571 0.697591
7 -0.405871 -0.754095
8 -0.043019 -0.375646
9 0.400488 0.381176
from sklearn import metrics
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, predicted))
print('Mean Squared Error (MSE) :', metrics.mean_squared_error(y_test, predicted))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, predicted)))
Mean Absolute Error (MAE): 0.1337945393494824
Mean Squared Error (MSE) : 0.031248325149281247
Root Mean Squared Error (RMSE): 0.17677195803995963
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price,  color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()
plt.plot(dfr.Actual_Price, color='black', label='Actual')
plt.plot(dfr.Predicted_Price, color='blue',label='Predicted')
plt.title("Predictions using  Historical")
plt.legend()
<matplotlib.legend.Legend at 0x7fc34d99c520>
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score
x_train, x_test, y_train, y_test = train_test_split(df_values, df_final['twoweeks'], test_size=0.3, random_state=0)

regression = LinearRegression()
regression.fit(x_train, y_train)
print("regression coefficient",regression.coef_)
print("regression intercept",regression.intercept_)
regression coefficient [ 2.66166755e+00 -3.53252621e-01 -1.39810203e+00  1.48573349e-01
 -8.49023102e-02  3.15909420e-02 -2.99197954e-02 -1.89951390e-03
 -6.61002963e-03  6.59002912e-03 -1.43476246e-03  8.87248181e-04
  5.30348639e-04 -2.67182404e-03]
regression intercept -0.018002014019515143
regression_confidence = regression.score(x_test, y_test)
print("linear regression confidence: ", regression_confidence)
linear regression confidence:  0.9493158696671368
predicted=regression.predict(x_test)
print(x_test.head())
         open      high       low     close  adjclose    volume    trends  \
26  -1.222595 -1.220040 -1.189692 -1.194005 -1.314978  0.075531  0.097369   
135  0.423421  0.383837  0.416818  0.388613  0.595698 -0.751431 -1.812569   
63   0.783028  0.758650  0.821157  0.777318  0.707787 -0.779556 -0.571109   
105 -0.615611 -0.554651 -0.594871 -0.573033 -0.461688 -0.918600 -0.189121   
24  -1.103859 -1.135303 -1.064246 -1.082658 -1.209359 -0.907303  0.192866   

     news_pos  news_neg  news_neu  tweet_pos  tweet_neg  tweet_neu  cluster  
26   0.512664  0.372447 -0.677374  -1.503002   0.366194   1.050759        4  
135 -1.106820  3.818192 -2.143669   2.117169  -1.044107  -1.015691        0  
63   0.188767  0.059197 -0.188609   2.569690  -1.044107  -1.428981        1  
105  1.052492  0.894529 -1.491982   0.126075   0.084134  -0.189111        4  
24  -1.106820 -1.193801  1.766450   0.824251   0.567665  -1.251857        2  
dfr=pd.DataFrame({'Actual_Price':y_test, 'Predicted_Price':predicted}).reset_index()
dfr.drop(columns='index', inplace=True, axis=1)
dfr.head(10)
Actual_Price Predicted_Price
0 -1.159801 -1.259679
1 0.471989 0.386546
2 0.753759 0.687921
3 -0.716651 -0.726870
4 -1.131320 -1.144796
5 -0.671596 -0.554696
6 0.666571 0.657503
7 -0.405871 -0.765469
8 -0.043019 -0.374303
9 0.400488 0.389789
from sklearn import metrics
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, predicted))
print('Mean Squared Error (MSE) :', metrics.mean_squared_error(y_test, predicted))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, predicted)))
Mean Absolute Error (MAE): 0.12894628403303512
Mean Squared Error (MSE) : 0.03027884626475542
Root Mean Squared Error (RMSE): 0.1740081787294937
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price,  color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()
plt.plot(dfr.Actual_Price, color='black', label='Actual')
plt.plot(dfr.Predicted_Price, color='blue',label='Predicted')
plt.title("Predictions using  Historical, Sentiment and Trends")
plt.legend()
<matplotlib.legend.Legend at 0x7fc34cef49a0>

Applying K-Means and CNN+LSTM on Historical Data:

df_x.head()
open high low close adjclose volume cluster
0 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 2
1 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 4
2 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 4
3 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 4
4 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 2
X = [[df_x.iloc[j, i] for i in range(df_x.shape[1])] for j in range(df_x.shape[0])]
Y = [df_final.iloc[i,8] for i in range(df_final.shape[0])] 
print(X[0:2])
print(Y[0:2])
[[-1.7038183261524975, -1.7302608232721683, -1.6829499939918715, -1.7057639670236395, -1.872820057485404, -0.8403922760048991, 2], [-1.5057895100199, -1.470010607913253, -1.4909599654415322, -1.4778224513157197, -1.6589052190920777, -0.12416025602251, 4]]
[-1.738791872212694, -1.6444381302921762]
from sklearn.model_selection import train_test_split
import numpy as np
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)

train_X = np.array(x_train)
test_X = np.array(x_test)
train_Y = np.array(y_train)
test_Y = np.array(y_test)

train_X = train_X.reshape(train_X.shape[0],1,7,1)
test_X = test_X.reshape(test_X.shape[0],1,7,1)

print(len(train_X))
print(len(test_X))
107
46
# For creating model and training
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
from tensorflow.keras.layers import MaxPooling1D, Flatten
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.metrics import RootMeanSquaredError

model = tf.keras.Sequential()

# Creating the Neural Network model here...
# CNN layers
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu', input_shape=(None,7, 1))))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(128, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Flatten()))
#model.add(Dense(5, kernel_regularizer=L2(0.01)))

# LSTM layers
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dropout(0.5))

#Final layers
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])

history = model.fit(train_X, train_Y, validation_data=(test_X,test_Y), epochs=40,batch_size=40, verbose=1, shuffle =True)
Epoch 1/40
3/3 [==============================] - 17s 2s/step - loss: 0.8435 - mse: 0.8435 - mae: 0.7191 - val_loss: 1.2994 - val_mse: 1.2994 - val_mae: 0.8909
Epoch 2/40
3/3 [==============================] - 0s 41ms/step - loss: 0.7868 - mse: 0.7868 - mae: 0.6972 - val_loss: 1.1748 - val_mse: 1.1748 - val_mae: 0.8485
Epoch 3/40
3/3 [==============================] - 0s 47ms/step - loss: 0.6732 - mse: 0.6732 - mae: 0.6493 - val_loss: 0.8909 - val_mse: 0.8909 - val_mae: 0.7464
Epoch 4/40
3/3 [==============================] - 0s 42ms/step - loss: 0.4508 - mse: 0.4508 - mae: 0.5399 - val_loss: 0.4124 - val_mse: 0.4124 - val_mae: 0.5224
Epoch 5/40
3/3 [==============================] - 0s 49ms/step - loss: 0.1894 - mse: 0.1894 - mae: 0.3295 - val_loss: 0.0471 - val_mse: 0.0471 - val_mae: 0.1709
Epoch 6/40
3/3 [==============================] - 0s 44ms/step - loss: 0.0823 - mse: 0.0823 - mae: 0.2286 - val_loss: 0.1999 - val_mse: 0.1999 - val_mae: 0.3940
Epoch 7/40
3/3 [==============================] - 0s 41ms/step - loss: 0.1960 - mse: 0.1960 - mae: 0.3494 - val_loss: 0.0906 - val_mse: 0.0906 - val_mae: 0.2370
Epoch 8/40
3/3 [==============================] - 0s 47ms/step - loss: 0.0836 - mse: 0.0836 - mae: 0.2222 - val_loss: 0.0874 - val_mse: 0.0874 - val_mae: 0.2231
Epoch 9/40
3/3 [==============================] - 0s 44ms/step - loss: 0.0851 - mse: 0.0851 - mae: 0.2084 - val_loss: 0.1338 - val_mse: 0.1338 - val_mae: 0.2902
Epoch 10/40
3/3 [==============================] - 0s 46ms/step - loss: 0.0925 - mse: 0.0925 - mae: 0.2278 - val_loss: 0.0918 - val_mse: 0.0918 - val_mae: 0.2423
Epoch 11/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0644 - mse: 0.0644 - mae: 0.1962 - val_loss: 0.0461 - val_mse: 0.0461 - val_mae: 0.1661
Epoch 12/40
3/3 [==============================] - 0s 68ms/step - loss: 0.0565 - mse: 0.0565 - mae: 0.1744 - val_loss: 0.0339 - val_mse: 0.0339 - val_mae: 0.1346
Epoch 13/40
3/3 [==============================] - 0s 63ms/step - loss: 0.0756 - mse: 0.0756 - mae: 0.1995 - val_loss: 0.0467 - val_mse: 0.0467 - val_mae: 0.1664
Epoch 14/40
3/3 [==============================] - 0s 65ms/step - loss: 0.0591 - mse: 0.0591 - mae: 0.1964 - val_loss: 0.0582 - val_mse: 0.0582 - val_mae: 0.1929
Epoch 15/40
3/3 [==============================] - 0s 75ms/step - loss: 0.0614 - mse: 0.0614 - mae: 0.1892 - val_loss: 0.0528 - val_mse: 0.0528 - val_mae: 0.1754
Epoch 16/40
3/3 [==============================] - 0s 69ms/step - loss: 0.0429 - mse: 0.0429 - mae: 0.1579 - val_loss: 0.0513 - val_mse: 0.0513 - val_mae: 0.1761
Epoch 17/40
3/3 [==============================] - 0s 76ms/step - loss: 0.0553 - mse: 0.0553 - mae: 0.1851 - val_loss: 0.0463 - val_mse: 0.0463 - val_mae: 0.1687
Epoch 18/40
3/3 [==============================] - 0s 75ms/step - loss: 0.0602 - mse: 0.0602 - mae: 0.1794 - val_loss: 0.0393 - val_mse: 0.0393 - val_mae: 0.1505
Epoch 19/40
3/3 [==============================] - 0s 71ms/step - loss: 0.0561 - mse: 0.0561 - mae: 0.1799 - val_loss: 0.0444 - val_mse: 0.0444 - val_mae: 0.1655
Epoch 20/40
3/3 [==============================] - 0s 66ms/step - loss: 0.0568 - mse: 0.0568 - mae: 0.1719 - val_loss: 0.0313 - val_mse: 0.0313 - val_mae: 0.1319
Epoch 21/40
3/3 [==============================] - 0s 67ms/step - loss: 0.0393 - mse: 0.0393 - mae: 0.1526 - val_loss: 0.0319 - val_mse: 0.0319 - val_mae: 0.1335
Epoch 22/40
3/3 [==============================] - 0s 64ms/step - loss: 0.0532 - mse: 0.0532 - mae: 0.1732 - val_loss: 0.0297 - val_mse: 0.0297 - val_mae: 0.1321
Epoch 23/40
3/3 [==============================] - 0s 66ms/step - loss: 0.0488 - mse: 0.0488 - mae: 0.1769 - val_loss: 0.0392 - val_mse: 0.0392 - val_mae: 0.1575
Epoch 24/40
3/3 [==============================] - 0s 70ms/step - loss: 0.0447 - mse: 0.0447 - mae: 0.1678 - val_loss: 0.0433 - val_mse: 0.0433 - val_mae: 0.1657
Epoch 25/40
3/3 [==============================] - 0s 68ms/step - loss: 0.0409 - mse: 0.0409 - mae: 0.1538 - val_loss: 0.0338 - val_mse: 0.0338 - val_mae: 0.1421
Epoch 26/40
3/3 [==============================] - 0s 71ms/step - loss: 0.0364 - mse: 0.0364 - mae: 0.1565 - val_loss: 0.0295 - val_mse: 0.0295 - val_mae: 0.1335
Epoch 27/40
3/3 [==============================] - 0s 42ms/step - loss: 0.0438 - mse: 0.0438 - mae: 0.1651 - val_loss: 0.0310 - val_mse: 0.0310 - val_mae: 0.1321
Epoch 28/40
3/3 [==============================] - 0s 41ms/step - loss: 0.0585 - mse: 0.0585 - mae: 0.1756 - val_loss: 0.0451 - val_mse: 0.0451 - val_mae: 0.1623
Epoch 29/40
3/3 [==============================] - 0s 40ms/step - loss: 0.0581 - mse: 0.0581 - mae: 0.1712 - val_loss: 0.0460 - val_mse: 0.0460 - val_mae: 0.1629
Epoch 30/40
3/3 [==============================] - 0s 52ms/step - loss: 0.0503 - mse: 0.0503 - mae: 0.1695 - val_loss: 0.0344 - val_mse: 0.0344 - val_mae: 0.1425
Epoch 31/40
3/3 [==============================] - 0s 53ms/step - loss: 0.0552 - mse: 0.0552 - mae: 0.1676 - val_loss: 0.0413 - val_mse: 0.0413 - val_mae: 0.1576
Epoch 32/40
3/3 [==============================] - 0s 45ms/step - loss: 0.0430 - mse: 0.0430 - mae: 0.1617 - val_loss: 0.0427 - val_mse: 0.0427 - val_mae: 0.1586
Epoch 33/40
3/3 [==============================] - 0s 45ms/step - loss: 0.0445 - mse: 0.0445 - mae: 0.1510 - val_loss: 0.0305 - val_mse: 0.0305 - val_mae: 0.1326
Epoch 34/40
3/3 [==============================] - 0s 44ms/step - loss: 0.0552 - mse: 0.0552 - mae: 0.1731 - val_loss: 0.0424 - val_mse: 0.0424 - val_mae: 0.1598
Epoch 35/40
3/3 [==============================] - 0s 45ms/step - loss: 0.0584 - mse: 0.0584 - mae: 0.1776 - val_loss: 0.0609 - val_mse: 0.0609 - val_mae: 0.1907
Epoch 36/40
3/3 [==============================] - 0s 44ms/step - loss: 0.0492 - mse: 0.0492 - mae: 0.1695 - val_loss: 0.0427 - val_mse: 0.0427 - val_mae: 0.1609
Epoch 37/40
3/3 [==============================] - 0s 45ms/step - loss: 0.0339 - mse: 0.0339 - mae: 0.1425 - val_loss: 0.0296 - val_mse: 0.0296 - val_mae: 0.1355
Epoch 38/40
3/3 [==============================] - 0s 54ms/step - loss: 0.0438 - mse: 0.0438 - mae: 0.1539 - val_loss: 0.0316 - val_mse: 0.0316 - val_mae: 0.1407
Epoch 39/40
3/3 [==============================] - 0s 49ms/step - loss: 0.0487 - mse: 0.0487 - mae: 0.1662 - val_loss: 0.0441 - val_mse: 0.0441 - val_mae: 0.1637
Epoch 40/40
3/3 [==============================] - 0s 45ms/step - loss: 0.0563 - mse: 0.0563 - mae: 0.1701 - val_loss: 0.0616 - val_mse: 0.0616 - val_mae: 0.1905
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7fc354999cf0>
model.evaluate(test_X, test_Y)
2/2 [==============================] - 0s 12ms/step - loss: 0.0616 - mse: 0.0616 - mae: 0.1905
[0.061608511954545975, 0.061608511954545975, 0.1905282735824585]
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import max_error

# model predictions for the test set (regression outputs, not class probabilities)
yhat_probs = model.predict(test_X, verbose=0)
# reduce to 1d array
yhat_probs = yhat_probs[:, 0]

var = explained_variance_score(test_Y.reshape(-1,1), yhat_probs)
print('Variance: %f' % var)

r2 = r2_score(test_Y.reshape(-1,1), yhat_probs)
print('R2 Score: %f' % r2)

var2 = max_error(test_Y.reshape(-1,1), yhat_probs)
print('Max Error: %f' % var2)
Variance: 0.964044
R2 Score: 0.964044
Max Error: 0.661186
predicted  = model.predict(test_X)

test_label = test_Y.reshape(-1,1)
predicted = np.array(predicted[:,0]).reshape(-1,1)
len_t = len(train_X)
for j in range(len_t , len_t + len(test_X)):
    temp = df_final.iloc[j,3]
    test_label[j - len_t] = test_label[j - len_t] * temp + temp
    predicted[j - len_t] = predicted[j - len_t] * temp + temp
plt.plot(predicted, color='green', label='Predicted Stock Price')
plt.plot(test_label, color='red', label='Real Stock Price')
plt.title('Stock Price Prediction')
plt.xlabel('Time')
plt.ylabel('Stock Price')
plt.legend()
plt.show()
2/2 [==============================] - 0s 9ms/step
X = [[df_values.iloc[j, i] for i in range(df_values.shape[1])] for j in range(df_values.shape[0])]
Y = [df_final.iloc[i,8] for i in range(df_final.shape[0])] 
print(X[0:2])
print(Y[0:2])
[[-1.7038183261524975, -1.7302608232721683, -1.6829499939918715, -1.7057639670236395, -1.872820057485404, -0.8403922760048991, 2.580289089276951, -1.1068195873526132, -1.1938008487946932, 1.7664503220919647, -1.5030018624207688, -0.1039065146895656, 1.464048694140032, 2], [-1.5057895100199, -1.470010607913253, -1.4909599654415322, -1.4778224513157197, -1.6589052190920777, -0.12416025602251, 2.198301444710552, -1.1068195873526132, 3.504942032500697, -1.8992863327148637, -1.5030018624207688, 1.2123738405060045, 0.3068368015117597, 4]]
[-1.738791872212694, -1.6444381302921762]
from sklearn.model_selection import train_test_split
import numpy as np
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)

train_X = np.array(x_train)
test_X = np.array(x_test)
train_Y = np.array(y_train)
test_Y = np.array(y_test)

train_X = train_X.reshape(train_X.shape[0],1,14,1)
test_X = test_X.reshape(test_X.shape[0],1,14,1)

print(len(train_X))
print(len(test_X))
107
46
# For creating model and training
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
from tensorflow.keras.layers import MaxPooling1D, Flatten
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.metrics import RootMeanSquaredError

model = tf.keras.Sequential()

# Creating the Neural Network model here...
# CNN layers
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu', input_shape=(None,14, 1))))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(128, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Flatten()))
#model.add(Dense(5, kernel_regularizer=L2(0.01)))

# LSTM layers
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dropout(0.5))

#Final layers
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])

history = model.fit(train_X, train_Y, validation_data=(test_X,test_Y), epochs=40,batch_size=40, verbose=1, shuffle =True)
Epoch 1/40
3/3 [==============================] - 13s 893ms/step - loss: 0.9283 - mse: 0.9283 - mae: 0.7439 - val_loss: 1.0816 - val_mse: 1.0816 - val_mae: 0.8186
Epoch 2/40
3/3 [==============================] - 0s 47ms/step - loss: 0.8707 - mse: 0.8707 - mae: 0.7204 - val_loss: 0.9586 - val_mse: 0.9586 - val_mae: 0.7668
Epoch 3/40
3/3 [==============================] - 0s 50ms/step - loss: 0.7502 - mse: 0.7502 - mae: 0.6681 - val_loss: 0.6875 - val_mse: 0.6875 - val_mae: 0.6335
Epoch 4/40
3/3 [==============================] - 0s 52ms/step - loss: 0.4556 - mse: 0.4556 - mae: 0.5155 - val_loss: 0.2252 - val_mse: 0.2252 - val_mae: 0.3242
Epoch 5/40
3/3 [==============================] - 0s 50ms/step - loss: 0.1291 - mse: 0.1291 - mae: 0.2721 - val_loss: 0.1801 - val_mse: 0.1801 - val_mae: 0.3321
Epoch 6/40
3/3 [==============================] - 0s 51ms/step - loss: 0.2382 - mse: 0.2382 - mae: 0.3739 - val_loss: 0.2083 - val_mse: 0.2083 - val_mae: 0.3451
Epoch 7/40
3/3 [==============================] - 0s 51ms/step - loss: 0.1393 - mse: 0.1393 - mae: 0.2724 - val_loss: 0.0689 - val_mse: 0.0689 - val_mae: 0.1855
Epoch 8/40
3/3 [==============================] - 0s 43ms/step - loss: 0.0738 - mse: 0.0738 - mae: 0.2105 - val_loss: 0.1492 - val_mse: 0.1492 - val_mae: 0.2463
Epoch 9/40
3/3 [==============================] - 0s 47ms/step - loss: 0.1093 - mse: 0.1093 - mae: 0.2484 - val_loss: 0.1612 - val_mse: 0.1612 - val_mae: 0.2658
Epoch 10/40
3/3 [==============================] - 0s 53ms/step - loss: 0.1245 - mse: 0.1245 - mae: 0.2644 - val_loss: 0.0971 - val_mse: 0.0971 - val_mae: 0.2057
Epoch 11/40
3/3 [==============================] - 0s 56ms/step - loss: 0.0750 - mse: 0.0750 - mae: 0.1969 - val_loss: 0.0586 - val_mse: 0.0586 - val_mae: 0.1809
Epoch 12/40
3/3 [==============================] - 0s 52ms/step - loss: 0.0657 - mse: 0.0657 - mae: 0.1980 - val_loss: 0.0731 - val_mse: 0.0731 - val_mae: 0.2134
Epoch 13/40
3/3 [==============================] - 0s 53ms/step - loss: 0.0723 - mse: 0.0723 - mae: 0.2060 - val_loss: 0.0743 - val_mse: 0.0743 - val_mae: 0.2120
Epoch 14/40
3/3 [==============================] - 0s 50ms/step - loss: 0.0556 - mse: 0.0556 - mae: 0.1859 - val_loss: 0.0583 - val_mse: 0.0583 - val_mae: 0.1829
Epoch 15/40
3/3 [==============================] - 0s 54ms/step - loss: 0.0518 - mse: 0.0518 - mae: 0.1744 - val_loss: 0.0575 - val_mse: 0.0575 - val_mae: 0.1691
Epoch 16/40
3/3 [==============================] - 0s 54ms/step - loss: 0.0541 - mse: 0.0541 - mae: 0.1732 - val_loss: 0.0638 - val_mse: 0.0638 - val_mae: 0.1766
Epoch 17/40
3/3 [==============================] - 0s 48ms/step - loss: 0.0530 - mse: 0.0530 - mae: 0.1773 - val_loss: 0.0641 - val_mse: 0.0641 - val_mae: 0.1815
Epoch 18/40
3/3 [==============================] - 0s 49ms/step - loss: 0.0559 - mse: 0.0559 - mae: 0.1679 - val_loss: 0.0602 - val_mse: 0.0602 - val_mae: 0.1766
Epoch 19/40
3/3 [==============================] - 0s 47ms/step - loss: 0.0456 - mse: 0.0456 - mae: 0.1608 - val_loss: 0.0615 - val_mse: 0.0615 - val_mae: 0.1804
Epoch 20/40
3/3 [==============================] - 0s 48ms/step - loss: 0.0413 - mse: 0.0413 - mae: 0.1605 - val_loss: 0.0617 - val_mse: 0.0617 - val_mae: 0.1815
Epoch 21/40
3/3 [==============================] - 0s 51ms/step - loss: 0.0497 - mse: 0.0497 - mae: 0.1733 - val_loss: 0.0627 - val_mse: 0.0627 - val_mae: 0.1825
Epoch 22/40
3/3 [==============================] - 0s 53ms/step - loss: 0.0401 - mse: 0.0401 - mae: 0.1543 - val_loss: 0.0651 - val_mse: 0.0651 - val_mae: 0.1765
Epoch 23/40
3/3 [==============================] - 0s 51ms/step - loss: 0.0520 - mse: 0.0520 - mae: 0.1778 - val_loss: 0.0619 - val_mse: 0.0619 - val_mae: 0.1746
Epoch 24/40
3/3 [==============================] - 0s 51ms/step - loss: 0.0423 - mse: 0.0423 - mae: 0.1565 - val_loss: 0.0629 - val_mse: 0.0629 - val_mae: 0.1894
Epoch 25/40
3/3 [==============================] - 0s 51ms/step - loss: 0.0293 - mse: 0.0293 - mae: 0.1376 - val_loss: 0.0655 - val_mse: 0.0655 - val_mae: 0.1943
Epoch 26/40
3/3 [==============================] - 0s 53ms/step - loss: 0.0472 - mse: 0.0472 - mae: 0.1656 - val_loss: 0.0562 - val_mse: 0.0562 - val_mae: 0.1722
Epoch 27/40
3/3 [==============================] - 0s 55ms/step - loss: 0.0426 - mse: 0.0426 - mae: 0.1676 - val_loss: 0.0572 - val_mse: 0.0572 - val_mae: 0.1703
Epoch 28/40
3/3 [==============================] - 0s 48ms/step - loss: 0.0424 - mse: 0.0424 - mae: 0.1590 - val_loss: 0.0587 - val_mse: 0.0587 - val_mae: 0.1770
Epoch 29/40
3/3 [==============================] - 0s 48ms/step - loss: 0.0371 - mse: 0.0371 - mae: 0.1488 - val_loss: 0.0595 - val_mse: 0.0595 - val_mae: 0.1810
Epoch 30/40
3/3 [==============================] - 0s 53ms/step - loss: 0.0390 - mse: 0.0390 - mae: 0.1529 - val_loss: 0.0626 - val_mse: 0.0626 - val_mae: 0.1866
Epoch 31/40
3/3 [==============================] - 0s 55ms/step - loss: 0.0418 - mse: 0.0418 - mae: 0.1524 - val_loss: 0.0601 - val_mse: 0.0601 - val_mae: 0.1780
Epoch 32/40
3/3 [==============================] - 0s 49ms/step - loss: 0.0474 - mse: 0.0474 - mae: 0.1659 - val_loss: 0.0569 - val_mse: 0.0569 - val_mae: 0.1792
Epoch 33/40
3/3 [==============================] - 0s 51ms/step - loss: 0.0389 - mse: 0.0389 - mae: 0.1597 - val_loss: 0.0588 - val_mse: 0.0588 - val_mae: 0.1899
Epoch 34/40
3/3 [==============================] - 0s 50ms/step - loss: 0.0407 - mse: 0.0407 - mae: 0.1512 - val_loss: 0.0592 - val_mse: 0.0592 - val_mae: 0.1928
Epoch 35/40
3/3 [==============================] - 0s 46ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1503 - val_loss: 0.0587 - val_mse: 0.0587 - val_mae: 0.1883
Epoch 36/40
3/3 [==============================] - 0s 42ms/step - loss: 0.0445 - mse: 0.0445 - mae: 0.1519 - val_loss: 0.0603 - val_mse: 0.0603 - val_mae: 0.1886
Epoch 37/40
3/3 [==============================] - 0s 49ms/step - loss: 0.0281 - mse: 0.0281 - mae: 0.1283 - val_loss: 0.0613 - val_mse: 0.0613 - val_mae: 0.1905
Epoch 38/40
3/3 [==============================] - 0s 51ms/step - loss: 0.0380 - mse: 0.0380 - mae: 0.1487 - val_loss: 0.0623 - val_mse: 0.0623 - val_mae: 0.1884
Epoch 39/40
3/3 [==============================] - 0s 43ms/step - loss: 0.0316 - mse: 0.0316 - mae: 0.1320 - val_loss: 0.0640 - val_mse: 0.0640 - val_mae: 0.1942
Epoch 40/40
3/3 [==============================] - 0s 51ms/step - loss: 0.0405 - mse: 0.0405 - mae: 0.1428 - val_loss: 0.0701 - val_mse: 0.0701 - val_mae: 0.1984
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7fc34d0a2080>
model.evaluate(test_X, test_Y)
2/2 [==============================] - 0s 13ms/step - loss: 0.0701 - mse: 0.0701 - mae: 0.1984
[0.07009752839803696, 0.07009752839803696, 0.19842290878295898]
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import max_error

# model predictions for the test set (regression outputs, not class probabilities)
yhat_probs = model.predict(test_X, verbose=0)
# reduce to 1d array
yhat_probs = yhat_probs[:, 0]

var = explained_variance_score(test_Y.reshape(-1,1), yhat_probs)
print('Variance: %f' % var)

r2 = r2_score(test_Y.reshape(-1,1), yhat_probs)
print('R2 Score: %f' % r2)

var2 = max_error(test_Y.reshape(-1,1), yhat_probs)
print('Max Error: %f' % var2)
Variance: 0.943367
R2 Score: 0.943367
Max Error: 0.796149
predicted  = model.predict(test_X)

test_label = test_Y.reshape(-1,1)
predicted = np.array(predicted[:,0]).reshape(-1,1)
len_t = len(train_X)
for j in range(len_t , len_t + len(test_X)):
    temp = df_final.iloc[j,3]
    test_label[j - len_t] = test_label[j - len_t] * temp + temp
    predicted[j - len_t] = predicted[j - len_t] * temp + temp
plt.plot(predicted, color='green', label='Predicted Stock Price')
plt.plot(test_label, color='red', label='Real Stock Price')
plt.title('Stock Price Prediction')
plt.xlabel('Time')
plt.ylabel('Stock Price')
plt.legend()
plt.show()
2/2 [==============================] - 0s 12ms/step

Results:

results.png
