Predicting Stock Prices of Home Depot Based on Trends and the Sentiment of News and Tweets

Betul Mescioglu
import pandas as pd
import seaborn
import matplotlib.pyplot as plt
import datetime as dt
from datetime import date
from dateutil.relativedelta import relativedelta
!pip install yahoo_fin

Stock Data of Home Depot:

from yahoo_fin import stock_info as si

def stock_data(stock_name):
    #Pull the last three years of daily OHLCV data for the given ticker
    df = si.get_data(stock_name, start_date=(date.today() - relativedelta(years=3)).strftime('%m/%d/%Y'), 
                     end_date=date.today().strftime('%m/%d/%Y'))
    return df
ticker='HD'
company_name='Home Depot'
df = stock_data(ticker)
df.head()
open high low close adjclose volume ticker
2020-06-08 252.490005 256.809998 252.259995 256.769989 239.264954 3811900 HD
2020-06-09 255.330002 258.290009 253.860001 256.760010 239.255646 3716100 HD
2020-06-10 257.450012 259.290009 254.220001 254.449997 237.103104 3589800 HD
2020-06-11 248.860001 250.619995 238.740005 239.470001 223.144394 6563700 HD
2020-06-12 243.070007 246.389999 237.050003 242.449997 225.921204 5238500 HD
df.tail()
open high low close adjclose volume ticker
2023-05-31 289.589996 290.000000 281.959991 283.450012 283.450012 18288800 HD
2023-06-01 284.049988 289.220001 279.980011 288.390015 288.390015 4305100 HD
2023-06-02 290.649994 296.209991 289.720001 295.940002 295.940002 4514600 HD
2023-06-05 295.619995 295.720001 291.369995 293.100006 293.100006 3028800 HD
2023-06-06 291.820007 296.920013 291.649994 296.000000 296.000000 2854800 HD
#Add target columns: the 10-day and 20-day rolling averages of the closing price.
#The dataset has values for weekdays only, so two weeks spans 10 data points
#and one month spans 20 data points.
df['TwoWeeks'] = df['close'].rolling(10).mean()
df['Month'] = df['close'].rolling(20).mean()
df.head()
open high low close adjclose volume ticker TwoWeeks Month
2020-06-08 252.490005 256.809998 252.259995 256.769989 239.264938 3811900 HD NaN NaN
2020-06-09 255.330002 258.290009 253.860001 256.760010 239.255646 3716100 HD NaN NaN
2020-06-10 257.450012 259.290009 254.220001 254.449997 237.103119 3589800 HD NaN NaN
2020-06-11 248.860001 250.619995 238.740005 239.470001 223.144379 6563700 HD NaN NaN
2020-06-12 243.070007 246.389999 237.050003 242.449997 225.921188 5238500 HD NaN NaN
df.dropna(inplace=True)
df.head()
open high low close adjclose volume ticker TwoWeeks Month
2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800 HD 247.481999 248.1370
2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800 HD 247.300999 247.6660
2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000 HD 247.187000 247.2865
2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700 HD 247.370000 246.9620
2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300 HD 247.842999 247.4940
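
Note that rolling(10).mean() is a trailing average: each row holds the mean of the current and previous nine closing prices. If the goal were instead to predict the average close over the next two weeks, the window would have to be shifted forward. A minimal sketch of that variant (df_alt is a hypothetical copy; it is not used below):

#Hypothetical forward-looking targets (sketch only):
#shift(-10) aligns each row with the mean of the NEXT 10 trading days
df_alt = df.copy()
df_alt['TwoWeeksAhead'] = df_alt['close'].rolling(10).mean().shift(-10)
df_alt['MonthAhead'] = df_alt['close'].rolling(20).mean().shift(-20)
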
!pip install pytrends
import pytrends
from pytrends.request import TrendReq
pytrends = TrendReq()
kw_list=['Home Depot']
#Gather Google Trends interest for 'Home Depot' over the last three years
pytrends.build_payload(kw_list, geo='',
                       timeframe='{} {}'.format((date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'),
                                                date.today().strftime('%Y-%m-%d')))
keyword_interest = pytrends.interest_over_time()
del keyword_interest['isPartial']
keyword_interest.head()
Home Depot
date
2020-06-07 100
2020-06-14 100
2020-06-21 96
2020-06-28 97
2020-07-05 90
keyword_interest['Home Depot'].plot(title='Google Trends for \'Home Depot\' in the last three years')
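
Google Trends could also track the ticker 'HD' alongside 'Home Depot': pytrends accepts up to five terms in a single payload, which puts all of them on the same relative 0-100 scale. A sketch of that comparison (not executed here; the rest of the notebook uses only 'Home Depot'):

#Sketch: compare both keywords on Google Trends' shared 0-100 scale
pytrends.build_payload(['HD', 'Home Depot'], geo='',
                       timeframe='{} {}'.format((date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'),
                                                date.today().strftime('%Y-%m-%d')))
both = pytrends.interest_over_time().drop(columns='isPartial')
both.plot(title="Google Trends: 'HD' vs 'Home Depot'")
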
df_combined = pd.concat([df, keyword_interest], axis=1)
#df_combined.dropna(subset='Month', inplace=True)
df_combined.head(20)
open high low close adjclose volume ticker TwoWeeks Month Home Depot
2020-06-07 NaN NaN NaN NaN NaN NaN NaN NaN NaN 100.0
2020-06-14 NaN NaN NaN NaN NaN NaN NaN NaN NaN 100.0
2020-06-21 NaN NaN NaN NaN NaN NaN NaN NaN NaN 96.0
2020-06-28 NaN NaN NaN NaN NaN NaN NaN NaN NaN 97.0
2020-07-05 NaN NaN NaN NaN NaN NaN NaN NaN NaN 90.0
2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800.0 HD 247.481999 248.137000 NaN
2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800.0 HD 247.300999 247.666000 NaN
2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000.0 HD 247.187000 247.286500 NaN
2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700.0 HD 247.370000 246.962000 NaN
2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300.0 HD 247.842999 247.494000 NaN
2020-07-12 NaN NaN NaN NaN NaN NaN NaN NaN NaN 86.0
2020-07-13 251.919998 257.859985 249.089996 249.619995 232.602386 4437500.0 HD 248.703999 247.852500 NaN
2020-07-14 249.000000 258.179993 248.460007 257.790009 240.215408 4614200.0 HD 249.871001 248.674001 NaN
2020-07-15 260.140015 261.290009 255.149994 257.799988 240.224716 4343700.0 HD 250.600000 249.066500 NaN
2020-07-16 256.760010 260.500000 256.000000 258.079987 240.485641 2511200.0 HD 251.592999 249.427999 NaN
2020-07-17 260.029999 260.649994 257.720001 260.380005 242.628845 3091300.0 HD 252.781000 249.986499 NaN
2020-07-19 NaN NaN NaN NaN NaN NaN NaN NaN NaN 85.0
2020-07-20 259.040009 261.200012 258.019989 260.170013 242.433228 2451100.0 HD 253.843001 250.662500 NaN
2020-07-21 261.619995 263.869995 260.720001 262.420013 244.529800 2439900.0 HD 255.350002 251.325500 NaN
2020-07-22 262.589996 265.589996 262.000000 265.170013 247.092316 2750000.0 HD 256.950003 252.068501 NaN

When data is pulled over a multi-year range, Google Trends returns one value per week, dated on Sundays. Since we are concerned with trends rather than daily values, I will forward-fill each Sunday value into the trading days of the week that follows.

df_combined['Home Depot'] = df_combined['Home Depot'].ffill()
df_combined.dropna(inplace=True)
df_combined.head(30)
open high low close adjclose volume ticker TwoWeeks Month Home Depot
2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800.0 HD 247.481999 248.137000 90.0
2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800.0 HD 247.300999 247.666000 90.0
2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000.0 HD 247.187000 247.286500 90.0
2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700.0 HD 247.370000 246.962000 90.0
2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300.0 HD 247.842999 247.494000 90.0
2020-07-13 251.919998 257.859985 249.089996 249.619995 232.602386 4437500.0 HD 248.703999 247.852500 86.0
2020-07-14 249.000000 258.179993 248.460007 257.790009 240.215408 4614200.0 HD 249.871001 248.674001 86.0
2020-07-15 260.140015 261.290009 255.149994 257.799988 240.224716 4343700.0 HD 250.600000 249.066500 86.0
2020-07-16 256.760010 260.500000 256.000000 258.079987 240.485641 2511200.0 HD 251.592999 249.427999 86.0
2020-07-17 260.029999 260.649994 257.720001 260.380005 242.628845 3091300.0 HD 252.781000 249.986499 86.0
2020-07-20 259.040009 261.200012 258.019989 260.170013 242.433228 2451100.0 HD 253.843001 250.662500 85.0
2020-07-21 261.619995 263.869995 260.720001 262.420013 244.529800 2439900.0 HD 255.350002 251.325500 85.0
2020-07-22 262.589996 265.589996 262.000000 265.170013 247.092316 2750000.0 HD 256.950003 252.068501 85.0
2020-07-23 267.799988 267.799988 261.799988 263.809998 245.824997 2680100.0 HD 258.535002 252.952501 85.0
2020-07-24 265.040009 266.890015 262.989990 265.309998 247.222717 2984500.0 HD 260.055002 253.949001 85.0
2020-07-27 265.089996 268.679993 265.089996 267.420013 249.188919 2412500.0 HD 261.835004 255.269501 85.0
2020-07-28 268.559998 269.070007 264.670013 265.279999 247.194794 2227000.0 HD 262.584003 256.227502 85.0
2020-07-29 264.799988 267.109985 264.170013 264.660004 246.617065 2874100.0 HD 263.270004 256.935002 85.0
2020-07-30 263.339996 267.350006 261.549988 266.309998 248.154556 2347800.0 HD 264.093005 257.843002 85.0
2020-07-31 265.000000 267.170013 260.609985 265.489990 247.390457 3640600.0 HD 264.604004 258.692502 85.0
2020-08-03 266.730011 268.579987 265.670013 266.179993 248.033463 2363500.0 HD 265.205002 259.524001 80.0
2020-08-04 266.630005 267.890015 263.839996 267.869995 249.608231 2224000.0 HD 265.750000 260.550001 80.0
2020-08-05 268.390015 268.390015 265.890015 267.480011 249.244797 1959900.0 HD 265.981000 261.465501 80.0
2020-08-06 266.600006 270.440002 266.529999 269.369995 251.005936 2203400.0 HD 266.537000 262.536001 80.0
2020-08-07 270.609985 274.920013 269.809998 271.640015 253.121140 2846300.0 HD 267.170001 263.612502 80.0
2020-08-10 272.420013 275.000000 271.799988 274.730011 256.000580 2393100.0 HD 267.901001 264.868002 79.0
2020-08-11 277.690002 279.369995 274.410004 274.920013 256.177612 3321300.0 HD 268.865002 265.724503 79.0
2020-08-12 279.750000 282.970001 276.959991 281.579987 262.383514 3867900.0 HD 270.557001 266.913503 79.0
2020-08-13 281.160004 282.649994 279.739990 281.660004 262.458069 2202400.0 HD 272.092001 268.092503 79.0
2020-08-14 281.140015 282.000000 279.190002 280.549988 261.423767 2490400.0 HD 273.598001 269.101003 79.0

Home Depot on Twitter:

!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
kw_list = ['HD', 'Home Depot']
kw_list
['HD', 'Home Depot']

# The following code collects tweets mentioning 'HD' or 'Home Depot' from the
# "markets" and "MarketWatch" Twitter accounts, published in the last three years

import snscrape.modules.twitter as sntwitter
import pandas as pd
import datetime as dt
from datetime import date

# Creating list to append tweet data to
attributes_container = []

# Candidate news accounts; only the two in `sources` below are scraped here
# (see the sketch at the end of this section for looping over all of them)
sources_all =['CNBC','cnn', 'cnnbrk','MarketWatch', 'Benzinga', 'Stocktwits','BreakoutStocks', 
          'bespokeinvest','WSJMarkets','Stephanie_Link','nytimesbusiness','IBDinvestors','WSJDealJournal',
         'business', 'TheEconomist','WSJ', 'ABC', 'CBSNews','FoxNews', 'NBCNews']

sources=['markets','MarketWatch']
query = 'from:{} since:{} until:{}'.format(sources[0],
                                           (date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'),
                                           date.today().strftime('%Y-%m-%d'))
for tweet in sntwitter.TwitterSearchScraper(query).get_items():
    if any(kw.lower() in tweet.content.lower() for kw in kw_list):
        attributes_container.append([tweet.date, tweet.content.split('http')[0]])
tweets_df = pd.DataFrame(attributes_container, columns=["Date Created", "Tweets"])
tweets_df.drop_duplicates(subset='Tweets',inplace=True)

attributes_container = []
query = 'from:{} since:{} until:{}'.format(sources[1],
                                           (date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'),
                                           date.today().strftime('%Y-%m-%d'))
for tweet in sntwitter.TwitterSearchScraper(query).get_items():
    if any(kw.lower() in tweet.content.lower() for kw in kw_list):
        attributes_container.append([tweet.date, tweet.user.username, tweet.content.split('http')[0]])
# Creating a dataframe from the tweets list above; the user name column
# is dropped later, before the two frames are combined
tweets_df_marketwatch = pd.DataFrame(attributes_container, columns=["Date Created", "User Name", "Tweets"])
print(tweets_df_marketwatch.shape)
tweets_df_marketwatch.drop_duplicates(subset='Tweets',inplace=True)
pd.set_option("display.max_colwidth", None)
tweets_df.head()
Date Created Tweets
0 2023-03-01 18:05:43+00:00 From Meta to Home Depot, corporate America is talking about AI on earnings calls
1 2023-02-21 19:06:41+00:00 Home Depot, Walmart and DocuSign.\n\n@RitikaGuptaTV has your stocks to watch this Tuesday
2 2023-02-21 15:15:07+00:00 Home Depot forecasts a fiscal-year profit decline and announces plans for a $1 billion wage investment for hourly workers
3 2022-11-15 18:11:03+00:00 Home Depot reported profit that beat expectations, with the CEO saying consumers are staying resilient. But a drop in transactions volume has investors concerned
4 2022-08-16 15:09:26+00:00 Home Depot’s second-quarter results beat Wall Street estimates even as the US housing market shows signs of cooling off
tweets_df_marketwatch.shape
(1159, 3)
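
The two scrapes above differ only in the account name, so the same logic can be wrapped in a helper and looped over the full sources_all list. A sketch, assuming every account is scraped the same way (scrape_mentions is a hypothetical helper, not part of the original run):

def scrape_mentions(account, phrases=('HD', 'Home Depot')):
    #Collect tweets from one account over the last three years that mention any phrase
    since = (date.today() - relativedelta(years=3)).strftime('%Y-%m-%d')
    until = date.today().strftime('%Y-%m-%d')
    query = 'from:{} since:{} until:{}'.format(account, since, until)
    rows = []
    for tweet in sntwitter.TwitterSearchScraper(query).get_items():
        if any(p.lower() in tweet.content.lower() for p in phrases):
            rows.append([tweet.date, tweet.content.split('http')[0]])
    return pd.DataFrame(rows, columns=['Date Created', 'Tweets'])

#all_tweets = pd.concat([scrape_mentions(s) for s in sources_all]).drop_duplicates(subset='Tweets')
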

Home Depot in Google News:

!pip install pygooglenews 
kw_list = ['Home Depot']
kw_list
['Home Depot']

import pygooglenews
from pygooglenews import GoogleNews
gn = GoogleNews()
headlines_related = []

date_list = pd.date_range(end=date.today().strftime('%Y-%m-%d'), start=(date.today()-relativedelta(years=3)).strftime('%Y-%m-%d')).tolist()
clean_date = [str(i).split(" ")[0] for i in date_list]
#Loop variable named `day` so it does not shadow the imported `date` class
for day in clean_date:
    headlines = []
    for word in kw_list:
        search = gn.search(word, when=day)
        for item in search['entries']:
            headlines.append(item['title'])
    #The search returns news whose body may contain one of the keywords.
    #We will use only headlines to make predictions, but some headlines do not
    #contain any of the keywords, so only include headlines that have one of them.
    for headline in headlines:
        for i in range(len(kw_list)):
            if kw_list[i] in headline:
                if (day, headline) not in headlines_related:
                    headlines_related.append((day, headline))
len(headlines_related)
1739
headlines_related[:10]
[('2020-06-08',
  "Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... - USA TODAY"),
 ('2020-06-17',
  "Here's the Defining Characteristic of Home Depot's Success - Nasdaq"),
 ('2020-06-17',
  'Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more - 9to5Toys'),
 ('2020-06-18',
  'Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more - 9to5Toys'),
 ('2020-06-24',
  '1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa - OCRegister'),
 ('2020-06-25',
  "'It's dehumanizing': Home Depot employee felt colleague's racist ... - Hamilton Spectator"),
 ('2020-06-30',
  'Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna-Glo ... - USA TODAY'),
 ('2020-06-30',
  'The Home Depot Announces Renewable Energy Goal and Pledges ... - PR Newswire'),
 ('2020-07-01',
  'Home Depot bans some rope sales after nooses were found tied on ... - Courier Journal'),
 ('2020-07-02',
  'Home Depot changes rope sales practice after nooses are found in store - CNN')]

df_news = pd.DataFrame(headlines_related, columns=['Date', 'Headline'])
df_news['Headline'] = df_news['Headline'].apply(lambda x: x.split('-')[0]) #strip the trailing news source; splitting on every hyphen also truncates hyphenated names (see the sketch at the end of this section)
df_news['Date'] = pd.to_datetime(df_news['Date'])
print(df_news.shape)
df_news.drop_duplicates(subset='Headline', inplace=True)
print(df_news.shape)
df_news.head()
(1739, 2)
(1373, 2)
Date Headline
0 2020-06-08 Home Depot Father's Day sales: Save on DeWalt and Milwaukee ...
1 2020-06-17 Here's the Defining Characteristic of Home Depot's Success
2 2020-06-17 Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more
3 2020-06-18 Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more
4 2020-06-24 1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa
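
The lambda above splits on every hyphen, so the source suffix is stripped but hyphenated names are also truncated (for example, 'Dyna-Glo' is cut to 'Dyna' in the sentiment tables further below). A safer variant, sketched here, would split only on the last spaced ' - ' separator:

#Sketch: strip only the trailing ' - Source' suffix, keeping hyphenated names intact
h = 'Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna-Glo ... - USA TODAY'
print(h.rsplit(' - ', 1)[0])   #keeps 'Dyna-Glo', drops only ' - USA TODAY'
print(h.split('-')[0])         #truncates at the first hyphen, inside 'Dyna-Glo'
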

Sentiment Analysis of All Text Data:

Tweets:

!pip install vaderSentiment
#Drop the user name column so both tweet frames have the same columns,
#then combine the tweets coming from markets and MarketWatch
tweets_df_marketwatch.drop("User Name", axis=1, inplace=True)
markets = pd.concat([tweets_df, tweets_df_marketwatch])
markets.shape
(1169, 2)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer() 
def vader(row):
    #Return VADER's normalized compound sentiment score, in [-1, 1]
    comp = sid.polarity_scores(row)['compound']
    return comp
import re
#Decode the HTML-escaped ampersand and replace newline/non-breaking-space characters
markets['Tweets'] = markets['Tweets'].apply(lambda x: re.sub("&amp;","&", x))
markets['Tweets'] = markets['Tweets'].apply(lambda x: re.sub("\n"," ", x))
markets['Tweets'] = markets['Tweets'].apply(lambda x: re.sub("\xa0"," ", x))
markets.head()
Date Created Tweets
0 2023-03-01 18:05:43+00:00 From Meta to Home Depot, corporate America is talking about AI on earnings calls
1 2023-02-21 19:06:41+00:00 Home Depot, Walmart and DocuSign. @RitikaGuptaTV has your stocks to watch this Tuesday
2 2023-02-21 15:15:07+00:00 Home Depot forecasts a fiscal-year profit decline and announces plans for a $1 billion wage investment for hourly workers
3 2022-11-15 18:11:03+00:00 Home Depot reported profit that beat expectations, with the CEO saying consumers are staying resilient. But a drop in transactions volume has investors concerned
4 2022-08-16 15:09:26+00:00 Home Depot’s second-quarter results beat Wall Street estimates even as the US housing market shows signs of cooling off
#Keep only the 'YYYY-MM-DD' date part of each timestamp
markets['Date Created'] = markets['Date Created'].astype('str')
markets['Date Created'] = markets['Date Created'].apply(lambda x: x[:10])
markets.head()
Date Created Tweets
0 2023-03-01 From Meta to Home Depot, corporate America is talking about AI on earnings calls
1 2023-02-21 Home Depot, Walmart and DocuSign. @RitikaGuptaTV has your stocks to watch this Tuesday
2 2023-02-21 Home Depot forecasts a fiscal-year profit decline and announces plans for a $1 billion wage investment for hourly workers
3 2022-11-15 Home Depot reported profit that beat expectations, with the CEO saying consumers are staying resilient. But a drop in transactions volume has investors concerned
4 2022-08-16 Home Depot’s second-quarter results beat Wall Street estimates even as the US housing market shows signs of cooling off
markets['Date Created'] = pd.to_datetime(markets['Date Created'])
markets.sort_values(by='Date Created', inplace=True)
markets.reset_index(drop=True, inplace=True)
markets.head()
Date Created Tweets
0 2020-06-06 What to make of the recent jobs numbers? Joseph Stiglitz says there's more than meets the eye, including those who aren't considered "unemployed," but aren't working. And without intervention, things may get worse, he says. WATCH:
1 2020-06-08 Bull, bear, bull, bear and now a new bull market — whatever’s next, these stocks will outperform, strategist says
2 2020-06-09 HD Supply misses on profit expectations but beats on sales
3 2020-06-11 Normally, Madrid swells with tourists. Our editor @bkollmeyer writes about how locals are reclaiming their city in a summer without travel.
4 2020-06-11 Should I tell my sister that her husband, a notorious spender, has a secret credit card?
markets['VaderSent'] = markets['Tweets'].apply(vader)
markets
Date Created Tweets VaderSent
0 2020-06-06 What to make of the recent jobs numbers? Joseph Stiglitz says there's more than meets the eye, including those who aren't considered "unemployed," but aren't working. And without intervention, things may get worse, he says. WATCH: -0.6310
1 2020-06-08 Bull, bear, bull, bear and now a new bull market — whatever’s next, these stocks will outperform, strategist says 0.0000
2 2020-06-09 HD Supply misses on profit expectations but beats on sales 0.1280
3 2020-06-11 Normally, Madrid swells with tourists. Our editor @bkollmeyer writes about how locals are reclaiming their city in a summer without travel. 0.0000
4 2020-06-11 Should I tell my sister that her husband, a notorious spender, has a secret credit card? -0.0772
... ... ... ...
1164 2023-05-30 Trans designer in Target anti-LGBTQ+ backlash says he was ‘dealt the worst hand’ -0.6249
1165 2023-05-31 Brown-Forman to invest $200 million to expand Tequila distillery in Jalisco, Mexico 0.3182
1166 2023-06-02 Zelle and Chase working to resolve duplicate-payments issue 0.3818
1167 2023-06-03 The ‘best job in America’ pays over $120,000 a year, offers low stress, healthy work-life balance — and its workers are in high demand -0.4019
1168 2023-06-05 Cava sets IPO terms, as restaurant chain is set to be valued at more than $2 billion 0.4404

1169 rows × 3 columns

#Vader gives a sentiment value between -1 and 1, -1 being the most negative, 
#1 being the most positive and 0 being neutral. We will put each text in the
#corresponding pos, neu or neg bin based on this value.
def bins(value):
    if value < -0.33:
        return 'neg'
    if value <= 0.33:
        return 'neu'
    return 'pos'
markets['sentiment'] = markets['VaderSent'].apply(bins)
markets.head(10)
Date Created Tweets VaderSent sentiment
0 2020-06-06 What to make of the recent jobs numbers? Joseph Stiglitz says there's more than meets the eye, including those who aren't considered "unemployed," but aren't working. And without intervention, things may get worse, he says. WATCH: -0.6310 neg
1 2020-06-08 Bull, bear, bull, bear and now a new bull market — whatever’s next, these stocks will outperform, strategist says 0.0000 neu
2 2020-06-09 HD Supply misses on profit expectations but beats on sales 0.1280 neu
3 2020-06-11 Normally, Madrid swells with tourists. Our editor @bkollmeyer writes about how locals are reclaiming their city in a summer without travel. 0.0000 neu
4 2020-06-11 Should I tell my sister that her husband, a notorious spender, has a secret credit card? -0.0772 neu
5 2020-06-12 Palantir Technologies Inc. is reportedly preparing to confidentially file for its long-awaited IPO. 0.0000 neu
6 2020-06-15 It's a well-known secret of Wall Street: Little business actually takes place in New York. We spoke to several Wall Street road warriors to learn what the past 3 months have been like when they can't fly anywhere. 0.3612 pos
7 2020-06-16 Dear airline passengers: Wear your face mask or you might get banned from flying. -0.1027 neu
8 2020-06-18 Facebook takes down Trump-Pence ads featuring symbols previously used by Nazis 0.0000 neu
9 2020-06-18 Dow opens with 170 point drop as jobless claims stay elevated -0.2732 neu
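
For reference, polarity_scores() returns the proportions of negative, neutral and positive tokens alongside the compound score used above. For example, on the HD Supply tweet scored 0.128 in the table:

#The compound value binned above is one of four scores VADER returns
sid.polarity_scores('HD Supply misses on profit expectations but beats on sales')
#-> a dict with 'neg', 'neu' and 'pos' proportions plus 'compound' (here 0.128)
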
#Each week, determine the share of positive, neutral and negative tweets.
#Iterating the weekly groupby yields (week, Series) pairs, which become the
#'Date' and 'Data' columns below.
tweets_weight = pd.DataFrame(markets.groupby([pd.Grouper(key='Date Created', freq='W')])['sentiment'], columns=['Date','Data'])
for i in range(len(tweets_weight)):
    n = dict(tweets_weight.loc[i,'Data'].value_counts(normalize=True))
    if 'neg' not in n:
        n['neg']=0
    if 'pos' not in n:
        n['pos']=0
    if 'neu' not in n:
        n['neu']=0
    tweets_weight.loc[i,'tweet_pos'] = n['pos']
    tweets_weight.loc[i,'tweet_neg'] = n['neg']
    tweets_weight.loc[i,'tweet_neu'] = n['neu']
tweets_weight
Date Data tweet_pos tweet_neg tweet_neu
0 2020-06-07 0 neg Name: sentiment, dtype: object 0.000000 1.000000 0.000000
1 2020-06-14 1 neu 2 neu 3 neu 4 neu 5 neu Name: sentiment, dtype: object 0.000000 0.000000 1.000000
2 2020-06-21 6 pos 7 neu 8 neu 9 neu 10 pos 11 neu Name: sentiment, dtype: object 0.333333 0.000000 0.666667
3 2020-06-28 12 neu 13 neu 14 neu 15 neu 16 neg 17 neu 18 neu 19 neg 20 neu Name: sentiment, dtype: object 0.000000 0.222222 0.777778
4 2020-07-05 21 neg 22 neg 23 neg 24 neg 25 pos 26 neu 27 neg 28 neu 29 neu 30 neu Name: sentiment, dtype: object 0.100000 0.500000 0.400000
... ... ... ... ... ...
153 2023-05-14 1141 pos 1142 neu 1143 neu 1144 neu 1145 neg Name: sentiment, dtype: object 0.200000 0.200000 0.600000
154 2023-05-21 1146 neu 1147 neu 1148 neu 1149 neu 1150 neu 1151 neu 1152 neg 1153 neu 1154 neg 1155 neu 1156 pos 1157 neu 1158 neu 1159 neu 1160 neu Name: sentiment, dtype: object 0.066667 0.133333 0.800000
155 2023-05-28 1161 neg 1162 neg 1163 neg Name: sentiment, dtype: object 0.000000 1.000000 0.000000
156 2023-06-04 1164 neg 1165 neu 1166 pos 1167 neg Name: sentiment, dtype: object 0.250000 0.500000 0.250000
157 2023-06-11 1168 pos Name: sentiment, dtype: object 1.000000 0.000000 0.000000

158 rows × 5 columns

tweets_weight.drop('Data', inplace=True, axis=1)
tweets_weight.head()
Date tweet_pos tweet_neg tweet_neu
0 2020-06-07 0.000000 1.000000 0.000000
1 2020-06-14 0.000000 0.000000 1.000000
2 2020-06-21 0.333333 0.000000 0.666667
3 2020-06-28 0.000000 0.222222 0.777778
4 2020-07-05 0.100000 0.500000 0.400000
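
The groupby construction above works, but the same weekly shares can be computed more directly with value_counts(normalize=True) and unstack(). An equivalent sketch (weeks with no tweets are dropped here rather than filled with zeros):

#Equivalent, more idiomatic computation of the weekly sentiment shares
weekly = (markets.groupby(pd.Grouper(key='Date Created', freq='W'))['sentiment']
                 .value_counts(normalize=True)
                 .unstack(fill_value=0)
                 .reindex(columns=['pos', 'neg', 'neu'], fill_value=0)
                 .add_prefix('tweet_')
                 .reset_index()
                 .rename(columns={'Date Created': 'Date'}))
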
tweets_weight.plot(x='Date', y=['tweet_pos','tweet_neu','tweet_neg'],title='Tweets\' Sentiment')

Tweets were mostly neutral in tone. Since the three weekly shares sum to one, the negative correlation between the positive and negative shares is unsurprising.

Google News:

df_news['VaderSent'] = df_news['Headline'].apply(vader)
df_news.head(10)
Date Headline VaderSent
0 2020-06-08 Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... 0.4939
1 2020-06-17 Here's the Defining Characteristic of Home Depot's Success 0.5719
2 2020-06-17 Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more 0.0000
3 2020-06-18 Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more 0.2500
4 2020-06-24 1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa -0.8860
5 2020-06-25 'It's dehumanizing': Home Depot employee felt colleague's racist ... -0.8126
6 2020-06-30 Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna 0.2023
7 2020-06-30 The Home Depot Announces Renewable Energy Goal and Pledges ... 0.2732
8 2020-07-01 Home Depot bans some rope sales after nooses were found tied on ... 0.0000
9 2020-07-02 Home Depot changes rope sales practice after nooses are found in store 0.0000
df_news['Date'] = pd.to_datetime(df_news['Date'])
df_news['sentiment'] = df_news['VaderSent'].apply(bins)
df_news.head(10)
Date Headline VaderSent sentiment
0 2020-06-08 Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... 0.4939 pos
1 2020-06-17 Here's the Defining Characteristic of Home Depot's Success 0.5719 pos
2 2020-06-17 Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more 0.0000 neu
3 2020-06-18 Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more 0.2500 neu
4 2020-06-24 1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa -0.8860 neg
5 2020-06-25 'It's dehumanizing': Home Depot employee felt colleague's racist ... -0.8126 neg
6 2020-06-30 Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna 0.2023 neu
7 2020-06-30 The Home Depot Announces Renewable Energy Goal and Pledges ... 0.2732 neu
8 2020-07-01 Home Depot bans some rope sales after nooses were found tied on ... 0.0000 neu
9 2020-07-02 Home Depot changes rope sales practice after nooses are found in store 0.0000 neu
#Repeat the weekly share computation for the news headlines
news_weight = pd.DataFrame(df_news.groupby([pd.Grouper(key='Date', freq='W')])['sentiment'], columns=['Date','Data'])
for i in range(len(news_weight)):
    n = dict(news_weight.loc[i,'Data'].value_counts(normalize=True))
    if 'neg' not in n:
        n['neg']=0
    if 'pos' not in n:
        n['pos']=0
    if 'neu' not in n:
        n['neu']=0
    news_weight.loc[i,'news_pos'] = n['pos']
    news_weight.loc[i,'news_neg'] = n['neg']
    news_weight.loc[i,'news_neu'] = n['neu']
news_weight
Date Data news_pos news_neg news_neu
0 2020-06-14 0 pos Name: sentiment, dtype: object 1.000000 0.000000 0.000000
1 2020-06-21 1 pos 2 neu 3 neu Name: sentiment, dtype: object 0.333333 0.000000 0.666667
2 2020-06-28 4 neg 5 neg Name: sentiment, dtype: object 0.000000 1.000000 0.000000
3 2020-07-05 6 neu 7 neu 8 neu 9 neu 10 neu 11 neg Name: sentiment, dtype: object 0.000000 0.166667 0.833333
4 2020-07-12 12 neu 13 neu Name: sentiment, dtype: object 0.000000 0.000000 1.000000
... ... ... ... ... ...
152 2023-05-14 1546 pos 1547 neu 1548 neu 1549 pos 1550 pos 1551 neu 1552 neu 1562 neu 1570 neu 1571 neg 1577 neu 1578 neu 1579 neu 1581 neu 1584 neg 1586 neu Name: sentiment, dtype: object 0.187500 0.125000 0.687500
153 2023-05-21 1587 neg 1589 neu 1590 neu 1592 neu 1593 neg 1594 neg 1595 neu 1596 neg 1597 neu 1598 neu 1599 neu 1600 neu 1601 pos 1602 neu 1603 neu 1604 neu 1605 neu 1606 neu 1608 neu 1609 neg 1610 neu 1611 neg 1614 neg 1615 pos 1616 neu 1619 neu 1620 neu 1621 neu 1623 pos 1631 neu 1632 neu 1633 neu Name: sentiment, dtype: object 0.093750 0.218750 0.687500
154 2023-05-28 1634 neu 1638 pos 1640 pos 1641 neu 1642 neu 1643 neg 1646 neu 1647 neu 1648 neu 1649 neg 1650 neu 1658 neu 1659 neg 1666 neu 1667 neu 1668 neu 1669 neu 1670 neu 1671 neg 1672 neu 1673 neg 1674 neg 1676 pos Name: sentiment, dtype: object 0.130435 0.260870 0.608696
155 2023-06-04 1677 pos 1679 neg 1680 neu 1682 neu 1683 neg 1685 neu 1692 neu 1694 neg 1696 neg 1698 pos 1699 pos 1700 pos 1704 pos 1705 neu 1706 neg 1708 neu 1710 pos 1711 neu 1713 neu 1716 neu 1718 neu 1719 neg 1721 neu 1722 neu 1724 neu Name: sentiment, dtype: object 0.240000 0.240000 0.520000
156 2023-06-11 1725 pos 1726 neu 1727 neu 1728 neu 1731 neg 1732 neu 1733 neg 1734 neu 1735 neu 1736 neg 1737 neg Name: sentiment, dtype: object 0.090909 0.363636 0.545455

157 rows × 5 columns

news_weight.drop('Data', inplace=True, axis=1)
news_weight
Date news_pos news_neg news_neu
0 2020-06-14 1.000000 0.000000 0.000000
1 2020-06-21 0.333333 0.000000 0.666667
2 2020-06-28 0.000000 1.000000 0.000000
3 2020-07-05 0.000000 0.166667 0.833333
4 2020-07-12 0.000000 0.000000 1.000000
... ... ... ... ...
152 2023-05-14 0.187500 0.125000 0.687500
153 2023-05-21 0.093750 0.218750 0.687500
154 2023-05-28 0.130435 0.260870 0.608696
155 2023-06-04 0.240000 0.240000 0.520000
156 2023-06-11 0.090909 0.363636 0.545455

157 rows × 4 columns

news_weight.plot(x='Date', y=['news_pos','news_neg','news_neu'], title='News Sentiment')

The news headlines were also mostly neutral in tone. Again, the positive and negative shares are negatively correlated.

df_combined.drop('ticker', inplace=True, axis=1)
df_combined.head()
open high low close adjclose volume TwoWeeks Month Home Depot
2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800.0 247.481999 248.1370 90.0
2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800.0 247.300999 247.6660 90.0
2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000.0 247.187000 247.2865 90.0
2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700.0 247.370000 246.9620 90.0
2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300.0 247.842999 247.4940 90.0
df_combined = df_combined.reset_index()
df_combined.rename(columns={'index':'Date'}, inplace=True)
df_combined.head()
Date open high low close adjclose volume TwoWeeks Month Home Depot
0 2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800.0 247.481999 248.1370 90.0
1 2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800.0 247.300999 247.6660 90.0
2 2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000.0 247.187000 247.2865 90.0
3 2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700.0 247.370000 246.9620 90.0
4 2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300.0 247.842999 247.4940 90.0
import numpy as np
#Average the daily rows into weekly bins so they align with the weekly sentiment frames
df_weekly = df_combined.groupby(pd.Grouper(key='Date', freq='W')).mean(numeric_only=True)
df_weekly.head()
open high low close adjclose volume TwoWeeks Month Home Depot
Date
2020-07-12 248.691998 250.581998 246.664001 248.828003 231.864401 2819120.0 247.436599 247.509100 90.0
2020-07-19 255.570004 259.695996 253.284000 256.733997 239.231400 3799580.0 250.709800 249.001500 86.0
2020-07-26 263.217999 265.070001 261.105994 263.376007 245.420612 2661120.0 256.946602 252.191601 85.0
2020-08-02 265.357996 267.876001 263.217999 265.832001 247.709158 2700400.0 263.277204 256.993502 85.0
2020-08-09 267.792004 270.044006 266.348004 268.508002 250.202713 2319420.0 266.128600 261.537601 80.0
df_final = df_weekly.merge(news_weight, on='Date').merge(tweets_weight, on='Date')
df_final.columns = [i.lower() for i in df_final.columns]
df_final.head()
date open high low close adjclose volume twoweeks month home depot news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 248.691998 250.581998 246.664001 248.828003 231.864401 2819120.0 247.436599 247.509100 90.0 0.000000 0.00 1.000000 0.000000 0.166667 0.833333
1 2020-07-19 255.570004 259.695996 253.284000 256.733997 239.231400 3799580.0 250.709800 249.001500 86.0 0.000000 0.75 0.250000 0.000000 0.400000 0.600000
2 2020-07-26 263.217999 265.070001 261.105994 263.376007 245.420612 2661120.0 256.946602 252.191601 85.0 0.333333 0.00 0.666667 0.125000 0.125000 0.750000
3 2020-08-02 265.357996 267.876001 263.217999 265.832001 247.709158 2700400.0 263.277204 256.993502 85.0 0.200000 0.20 0.600000 0.181818 0.272727 0.545455
4 2020-08-09 267.792004 270.044006 266.348004 268.508002 250.202713 2319420.0 266.128600 261.537601 80.0 0.333333 0.00 0.666667 0.333333 0.222222 0.444444
df_final.rename(columns={"home depot":"trends"}, inplace=True)
df_final.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 248.691998 250.581998 246.664001 248.828003 231.864401 2819120.0 247.436599 247.509100 90.0 0.000000 0.00 1.000000 0.000000 0.166667 0.833333
1 2020-07-19 255.570004 259.695996 253.284000 256.733997 239.231400 3799580.0 250.709800 249.001500 86.0 0.000000 0.75 0.250000 0.000000 0.400000 0.600000
2 2020-07-26 263.217999 265.070001 261.105994 263.376007 245.420612 2661120.0 256.946602 252.191601 85.0 0.333333 0.00 0.666667 0.125000 0.125000 0.750000
3 2020-08-02 265.357996 267.876001 263.217999 265.832001 247.709158 2700400.0 263.277204 256.993502 85.0 0.200000 0.20 0.600000 0.181818 0.272727 0.545455
4 2020-08-09 267.792004 270.044006 266.348004 268.508002 250.202713 2319420.0 266.128600 261.537601 80.0 0.333333 0.00 0.666667 0.333333 0.222222 0.444444
#Scaling the data
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()
df_final_scaled = pd.DataFrame(SS.fit_transform(df_final.iloc[:,1:]), columns=df_final.columns[1:])
df_final_scaled.insert(0, "date", df_final['date'])
#Save the scaled frame; the modeling sections below read it back from this file
df_final_scaled.to_csv('df_final_home_depot_scaled.csv')
df_final_scaled.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 -1.738792 -1.736071 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 2020-07-19 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 -1.644438 -1.692888 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 2020-07-26 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 -1.464655 -1.600581 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 2020-08-02 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 -1.282168 -1.461636 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 2020-08-09 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 -1.199974 -1.330150 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
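
One caveat: fitting the scaler on all three years lets information from the later test weeks leak into the training features. A leakage-free sketch, assuming the same 70/30 chronological split used in the modeling sections below:

#Sketch: fit the scaler on the earliest 70% of weeks only
split = int(len(df_final) * 0.7)
SS_train = StandardScaler().fit(df_final.iloc[:split, 1:])
train_scaled = SS_train.transform(df_final.iloc[:split, 1:])
test_scaled = SS_train.transform(df_final.iloc[split:, 1:])
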
df_final_scaled.corr(numeric_only=True).round(3)
open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
open 1.000 0.999 0.999 0.998 0.986 -0.051 0.983 0.954 -0.221 0.241 -0.043 -0.148 0.009 -0.075 0.058
high 0.999 1.000 0.998 0.999 0.988 -0.040 0.982 0.954 -0.227 0.246 -0.038 -0.156 0.010 -0.074 0.056
low 0.999 0.998 1.000 0.999 0.985 -0.075 0.978 0.947 -0.210 0.238 -0.045 -0.145 0.008 -0.079 0.062
close 0.998 0.999 0.999 1.000 0.988 -0.059 0.978 0.949 -0.218 0.245 -0.041 -0.152 0.009 -0.077 0.059
adjclose 0.986 0.988 0.985 0.988 1.000 -0.042 0.970 0.945 -0.321 0.267 -0.013 -0.192 0.011 -0.072 0.053
volume -0.051 -0.040 -0.075 -0.059 -0.042 1.000 0.018 0.058 -0.057 0.048 -0.005 -0.032 0.014 0.090 -0.092
twoweeks 0.983 0.982 0.978 0.978 0.970 0.018 1.000 0.986 -0.266 0.248 -0.035 -0.160 0.006 -0.077 0.062
month 0.954 0.954 0.947 0.949 0.945 0.058 0.986 1.000 -0.311 0.229 -0.034 -0.146 -0.003 -0.074 0.068
trends -0.221 -0.227 -0.210 -0.218 -0.321 -0.057 -0.266 -0.311 1.000 -0.227 -0.072 0.227 -0.015 0.006 0.008
news_pos 0.241 0.246 0.238 0.245 0.267 0.048 0.248 0.229 -0.227 1.000 -0.151 -0.637 -0.074 0.002 0.066
news_neg -0.043 -0.038 -0.045 -0.041 -0.013 -0.005 -0.035 -0.034 -0.072 -0.151 1.000 -0.666 -0.004 -0.005 0.008
news_neu -0.148 -0.156 -0.145 -0.152 -0.192 -0.032 -0.160 -0.146 0.227 -0.637 -0.666 1.000 0.059 0.003 -0.056
tweet_pos 0.009 0.010 0.008 0.009 0.011 0.014 0.006 -0.003 -0.015 -0.074 -0.004 0.059 1.000 -0.378 -0.581
tweet_neg -0.075 -0.074 -0.079 -0.077 -0.072 0.090 -0.077 -0.074 0.006 0.002 -0.005 0.003 -0.378 1.000 -0.534
tweet_neu 0.058 0.056 0.062 0.059 0.053 -0.092 0.062 0.068 0.008 0.066 0.008 -0.056 -0.581 -0.534 1.000

There is a modest positive correlation between positive news and the two-week price average; we do not see a comparable correlation for positive tweets or any of the other sentiment columns. Additionally, there is a negative correlation between trends and the two-week price average: weeks in which people search Google for "Home Depot" more heavily tend to coincide with lower prices.

cols=['news_pos', 'news_neg','news_neu', 'tweet_pos', 'tweet_neg','tweet_neu', 'trends']
fig, ax = plt.subplots(3, 3, figsize=(15, 12))
ax = ax.ravel()
for i in range(len(cols)):
    df_final_scaled.plot(x='date', y=['close','twoweeks','month', cols[i]], ax=ax[i], title=cols[i])
#Hide the two unused subplots
ax[7].axis('off')
ax[8].axis('off')
plt.tight_layout()

We can observe the correlations mentioned above in the plots as well. The first plot, representing positive news, indicates that positive news loosely follows the stock price. On the other hand, the last plot, depicting trends, shows a negative correlation between Google searches and the stock price. Furthermore, over time, the number of Google searches for "Home Depot" has steadily declined.

Applying Linear Regression to Historical Price Data:

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
plt.style.use("ggplot")
df_final_scaled = pd.read_csv('df_final_home_depot_scaled.csv')  #saved at the end of the scaling step above
df_final_scaled.drop('Unnamed: 0', inplace=True, axis=1)  #drop the index column written by to_csv
df_final_scaled.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 -1.738792 -1.736071 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 2020-07-19 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 -1.644438 -1.692888 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 2020-07-26 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 -1.464655 -1.600581 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 2020-08-02 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 -1.282168 -1.461636 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 2020-08-09 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 -1.199974 -1.330150 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
x= df_final_scaled[['open','high','low','close', 'adjclose','volume','news_pos', 'news_neg', 'news_neu', 'tweet_pos',
       'tweet_neg', 'tweet_neu', 'trends']]
y=df_final_scaled['twoweeks']
import statsmodels.api as sm
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               twoweeks   R-squared:                       0.977
Model:                            OLS   Adj. R-squared:                  0.975
Method:                 Least Squares   F-statistic:                     537.8
Date:                Mon, 12 Jun 2023   Prob (F-statistic):          3.14e-109
Time:                        02:36:26   Log-Likelihood:                 70.561
No. Observations:                 153   AIC:                            -117.1
Df Residuals:                     141   BIC:                            -80.76
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.501e-16      0.013  -1.17e-14      1.000      -0.025       0.025
open           2.3468      0.511      4.594      0.000       1.337       3.357
high          -0.2428      0.541     -0.448      0.655      -1.313       0.828
low           -0.7347      0.559     -1.314      0.191      -1.840       0.371
close         -0.2749      0.545     -0.504      0.615      -1.352       0.802
adjclose      -0.1252      0.118     -1.064      0.289      -0.358       0.107
volume         0.0494      0.017      2.981      0.003       0.017       0.082
news_pos       0.0014      0.010      0.141      0.888      -0.019       0.022
news_neg       0.0043      0.010      0.438      0.662      -0.015       0.023
news_neu      -0.0044      0.007     -0.594      0.554      -0.019       0.010
tweet_pos     -0.0020      0.009     -0.229      0.819      -0.019       0.015
tweet_neg     -0.0071      0.009     -0.778      0.438      -0.025       0.011
tweet_neu      0.0081      0.008      1.009      0.315      -0.008       0.024
trends        -0.0524      0.019     -2.785      0.006      -0.090      -0.015
==============================================================================
Omnibus:                        1.461   Durbin-Watson:                   1.694
Prob(Omnibus):                  0.482   Jarque-Bera (JB):                1.534
Skew:                           0.193   Prob(JB):                        0.464
Kurtosis:                       2.697   Cond. No.                     5.93e+15
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.26e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
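
The huge condition number flagged in note [2] reflects the near-perfect correlation among the price columns seen in the correlation matrix (open, high, low, close and adjclose all correlate above 0.98). Variance inflation factors make this explicit; a diagnostic sketch:

#Sketch: variance inflation factors for the price/volume features
from statsmodels.stats.outliers_influence import variance_inflation_factor
price_cols = ['open', 'high', 'low', 'close', 'adjclose', 'volume']
exog = df_final_scaled[price_cols].values
for i, col in enumerate(price_cols):
    print(col, variance_inflation_factor(exog, i))
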
x = df_final_scaled[['open','high','low','close', 'adjclose','volume']]
y = df_final_scaled['twoweeks']
from sklearn.model_selection import train_test_split 
#shuffle=False keeps the split chronological: train on the earlier weeks, test on the later ones
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, shuffle=False, random_state=0)
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(train_x, train_y)
print("regression coefficient",regression.coef_)
print("regression intercept",regression.intercept_)
regression coefficient [ 2.15513818  0.1568129  -0.63183193 -0.94351993  0.2494587   0.05989017]
regression intercept 0.017827179785784394
regression_confidence = regression.score(test_x, test_y)  #score() returns R^2 on the test set
print("linear regression confidence: ", regression_confidence)
linear regression confidence:  0.8354280496114086
predicted=regression.predict(test_x)
print(test_x.head())
         open      high       low     close  adjclose    volume
107 -0.243796 -0.255966 -0.239776 -0.208430 -0.104242 -0.699127
108 -0.115385 -0.079953 -0.082182 -0.058794  0.042456 -0.785429
109  0.099918  0.070874  0.110736  0.082941  0.181410 -0.899154
110  0.366759  0.437578  0.380624  0.428573  0.520257  0.542134
111  0.080282  0.039007  0.007433 -0.028003  0.072644 -0.737113
dfr=pd.DataFrame({'Actual_Price':test_y, 'Predicted_Price':predicted})
dfr.head(10)
Actual_Price Predicted_Price
107 -0.251375 -0.267445
108 -0.142925 -0.172431
109 -0.051125 0.087459
110 0.166720 0.394254
111 0.250333 0.192664
112 -0.043019 -0.309534
113 -0.333535 -0.397486
114 -0.476311 -0.423891
115 -0.740981 -0.954391
116 -0.992016 -0.840699
dfr.describe()
Actual_Price Predicted_Price
count 46.000000 46.000000
mean -0.191756 -0.132461
std 0.438800 0.433547
min -0.992016 -0.954391
25% -0.502699 -0.417290
50% -0.331794 -0.271980
75% 0.274308 0.247046
max 0.478988 0.858482
from sklearn import metrics
import numpy as np
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(test_y, predicted))
print('Mean Squared Error (MSE) :', metrics.mean_squared_error(test_y, predicted))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(test_y, predicted)))
Mean Absolute Error (MAE): 0.14249150773348154
Mean Squared Error (MSE) : 0.030998755667901733
Root Mean Squared Error (RMSE): 0.17606463491542454
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price,  color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()
plt.plot(dfr.Actual_Price, color='black', label='Actual')
plt.plot(dfr.Predicted_Price, color='blue',label='Predicted')
plt.title("Prediction using only Historical Data")
plt.legend()

Applying Linear Regression to Both Historical and Newly Added Features:

df_final_scaled.columns
Index(['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume',
       'twoweeks', 'month', 'trends', 'news_pos', 'news_neg', 'news_neu',
       'tweet_pos', 'tweet_neg', 'tweet_neu'],
      dtype='object')
x= df_final_scaled[['open','high','low','close', 'adjclose','volume','news_pos', 'news_neg', 'news_neu', 'tweet_pos',
       'tweet_neg', 'tweet_neu', 'trends']]

y=df_final_scaled['twoweeks']
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3 , shuffle=False,random_state = 0)
regression = LinearRegression()
regression.fit(train_x, train_y)
print("regression coefficient",regression.coef_)
print("regression intercept",regression.intercept_)
regression coefficient [ 1.91871810e+00  1.89637622e-01 -3.49725608e-01 -8.35917172e-01
  4.63540880e-02  6.48403899e-02  1.45467759e-03  5.40953328e-03
 -5.31782650e-03 -3.13379351e-03 -7.57685954e-03  9.52332816e-03
 -5.55338621e-02]
regression intercept 0.023765631258188702
regression_confidence = regression.score(test_x, test_y)
print("linear regression confidence: ", regression_confidence)
linear regression confidence:  0.8294859517339385
predicted=regression.predict(test_x)
print(test_x.head())
         open      high       low     close  adjclose    volume  news_pos  \
107 -0.243796 -0.255966 -0.239776 -0.208430 -0.104242 -0.699127  0.332721   
108 -0.115385 -0.079953 -0.082182 -0.058794  0.042456 -0.785429 -1.106820   
109  0.099918  0.070874  0.110736  0.082941  0.181410 -0.899154  0.332721   
110  0.366759  0.437578  0.380624  0.428573  0.520257  0.542134  0.512664   
111  0.080282  0.039007  0.007433 -0.028003  0.072644 -0.737113  0.512664   

     news_neg  news_neu  tweet_pos  tweet_neg  tweet_neu    trends  
107  0.894529 -0.948910   0.533344   1.071344  -1.428981 -0.475612  
108  0.894529  0.137234   0.824251  -1.044107   0.165137 -0.380115  
109  0.894529 -0.948910  -1.503002   0.567665   0.873634 -0.666606  
110 -0.802239  0.239060   0.471637  -0.018434  -0.414542 -0.666606  
111  0.372447 -0.677374   1.890908  -0.338957  -1.428981 -0.475612  
from sklearn.metrics import r2_score
r2_score(predicted, test_y)  #NB: arguments are swapped here; see the note below
0.8290817124619406
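
Note that r2_score expects the true values first: r2_score(y_true, y_pred). With the arguments swapped, as above, the result is a different quantity from regression.score(test_x, test_y). The conventional call:

#r2_score(y_true, y_pred): true values first
r2_score(test_y, predicted)   #matches regression.score(test_x, test_y)
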
dfr=pd.DataFrame({'Actual_Price':test_y, 'Predicted_Price':predicted})
dfr.head(10)
Actual_Price Predicted_Price
107 -0.251375 -0.271245
108 -0.142925 -0.153351
109 -0.051125 0.127086
110 0.166720 0.405224
111 0.250333 0.177392
112 -0.043019 -0.300313
113 -0.333535 -0.381184
114 -0.476311 -0.385414
115 -0.740981 -0.939092
116 -0.992016 -0.805985
dfr.describe()
Actual_Price Predicted_Price
count 46.000000 46.000000
mean -0.191756 -0.112709
std 0.438800 0.438281
min -0.992016 -0.939092
25% -0.502699 -0.419594
50% -0.331794 -0.269632
75% 0.274308 0.268095
max 0.478988 0.895990
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(test_y, predicted))
print('Mean Squared Error (MSE) :', metrics.mean_squared_error(test_y, predicted))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(test_y, predicted)))
Mean Absolute Error (MAE): 0.14758907019543632
Mean Squared Error (MSE) : 0.0321180086136408
Root Mean Squared Error (RMSE): 0.17921497876472492
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price,  color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()
plt.plot(dfr.Actual_Price, color='black', label='Actual')
plt.plot(dfr.Predicted_Price, color='blue',label='Predicted')
plt.title("Predictions using  Historical, Sentiment and Trends")
plt.legend()

Applying CNN+LSTM to Historical Data:

import pandas as pd
df = pd.read_csv('df_final_home_depot_scaled.csv')
df.drop('Unnamed: 0', inplace=True, axis=1)
df.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 -1.738792 -1.736071 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 2020-07-19 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 -1.644438 -1.692888 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 2020-07-26 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 -1.464655 -1.600581 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 2020-08-02 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 -1.282168 -1.461636 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 2020-08-09 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 -1.199974 -1.330150 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
df.columns
Index(['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume',
       'twoweeks', 'month', 'trends', 'news_pos', 'news_neg', 'news_neu',
       'tweet_pos', 'tweet_neg', 'tweet_neu'],
      dtype='object')
df_hist = df[['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume',
       'twoweeks', 'month']]
df_hist.shape
(153, 9)
#Use the six scaled price/volume columns (open, high, low, close, adjclose, volume) as inputs
X = df_hist[['open', 'high', 'low', 'close', 'adjclose', 'volume']].values.tolist()
#Use the "twoweeks" column as the target
Y = df_hist['twoweeks'].tolist()
import numpy as np
from sklearn.model_selection import train_test_split 


#NB: shuffle=True mixes weeks from all periods across train and test,
#unlike the chronological split used for the linear models above
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)

train_X = np.array(x_train)
test_X = np.array(x_test)
train_Y = np.array(y_train)
test_Y = np.array(y_test)
#Reshape each sample to (time steps=1, features=6, channels=1) for the TimeDistributed Conv1D layers
train_X = train_X.reshape(train_X.shape[0],1,6,1)
test_X = test_X.reshape(test_X.shape[0],1,6,1)
print(len(train_X))
print(len(test_X))
107
46
train_X[0]
array([[[-1.09683374],
        [-1.10216509],
        [-1.12420655],
        [-1.11575665],
        [-1.28069411],
        [-0.48313096]]])
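
Each sample above is a single week reshaped to (time steps=1, features=6, channels=1), so the LSTM layers never see an actual sequence of weeks. A more conventional CNN+LSTM input stacks a lookback window of past weeks per sample; a sketch, assuming a hypothetical four-week lookback (not used below):

#Sketch: sliding four-week windows, so each input is a true sequence
LOOKBACK = 4
X_arr, Y_arr = np.array(X), np.array(Y)
X_seq = np.array([X_arr[i:i + LOOKBACK] for i in range(len(X_arr) - LOOKBACK)])
Y_seq = Y_arr[LOOKBACK:]   #target is the week following each window
X_seq = X_seq.reshape(X_seq.shape[0], LOOKBACK, 6, 1)   #(samples, weeks, features, channels)
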
from tensorflow.keras import backend as K
K.image_data_format()=="channels_first"
False
# For creating model and training
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
from tensorflow.keras.layers import MaxPooling1D, Flatten
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.metrics import RootMeanSquaredError

model = tf.keras.Sequential()

# Creating the Neural Network model here...
# CNN layers: each weekly sample has shape (1, 6, 1) -> one time step holding six features
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu', input_shape=(None, 6, 1))))
model.add(TimeDistributed(MaxPooling1D(1)))  #pool size 1 leaves the sequence length unchanged
model.add(TimeDistributed(Conv1D(128, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Flatten()))
# model.add(Dense(5, kernel_regularizer=L2(0.01)))

# LSTM layers
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dropout(0.5))

#Final layers
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])

history = model.fit(train_X, train_Y, validation_data=(test_X,test_Y), epochs=40,batch_size=40, verbose=1, shuffle =True)
Epoch 1/40
3/3 [==============================] - 13s 910ms/step - loss: 0.8740 - mse: 0.8740 - mae: 0.7390 - val_loss: 1.1876 - val_mse: 1.1876 - val_mae: 0.8232
Epoch 2/40
3/3 [==============================] - 0s 34ms/step - loss: 0.8003 - mse: 0.8003 - mae: 0.7047 - val_loss: 1.0375 - val_mse: 1.0375 - val_mae: 0.7627
Epoch 3/40
3/3 [==============================] - 0s 38ms/step - loss: 0.6692 - mse: 0.6692 - mae: 0.6315 - val_loss: 0.7258 - val_mse: 0.7258 - val_mae: 0.6202
Epoch 4/40
3/3 [==============================] - 0s 42ms/step - loss: 0.3853 - mse: 0.3853 - mae: 0.4627 - val_loss: 0.2971 - val_mse: 0.2971 - val_mae: 0.3433
Epoch 5/40
3/3 [==============================] - 0s 41ms/step - loss: 0.1832 - mse: 0.1832 - mae: 0.2912 - val_loss: 0.1127 - val_mse: 0.1127 - val_mae: 0.2656
Epoch 6/40
3/3 [==============================] - 0s 46ms/step - loss: 0.1054 - mse: 0.1054 - mae: 0.2555 - val_loss: 0.2310 - val_mse: 0.2310 - val_mae: 0.3723
Epoch 7/40
3/3 [==============================] - 0s 67ms/step - loss: 0.1590 - mse: 0.1590 - mae: 0.3158 - val_loss: 0.1858 - val_mse: 0.1858 - val_mae: 0.3358
Epoch 8/40
3/3 [==============================] - 0s 60ms/step - loss: 0.1221 - mse: 0.1221 - mae: 0.2605 - val_loss: 0.0740 - val_mse: 0.0740 - val_mae: 0.2339
Epoch 9/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0769 - mse: 0.0769 - mae: 0.2229 - val_loss: 0.0902 - val_mse: 0.0902 - val_mae: 0.2187
Epoch 10/40
3/3 [==============================] - 0s 81ms/step - loss: 0.0740 - mse: 0.0740 - mae: 0.1926 - val_loss: 0.1062 - val_mse: 0.1062 - val_mae: 0.2100
Epoch 11/40
3/3 [==============================] - 0s 64ms/step - loss: 0.0807 - mse: 0.0807 - mae: 0.1986 - val_loss: 0.0893 - val_mse: 0.0893 - val_mae: 0.2025
Epoch 12/40
3/3 [==============================] - 0s 61ms/step - loss: 0.0891 - mse: 0.0891 - mae: 0.2086 - val_loss: 0.0575 - val_mse: 0.0575 - val_mae: 0.1793
Epoch 13/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0599 - mse: 0.0599 - mae: 0.1769 - val_loss: 0.0513 - val_mse: 0.0513 - val_mae: 0.1723
Epoch 14/40
3/3 [==============================] - 0s 65ms/step - loss: 0.0515 - mse: 0.0515 - mae: 0.1749 - val_loss: 0.0740 - val_mse: 0.0740 - val_mae: 0.2093
Epoch 15/40
3/3 [==============================] - 0s 57ms/step - loss: 0.0550 - mse: 0.0550 - mae: 0.1783 - val_loss: 0.0788 - val_mse: 0.0788 - val_mae: 0.2178
Epoch 16/40
3/3 [==============================] - 0s 56ms/step - loss: 0.0655 - mse: 0.0655 - mae: 0.1941 - val_loss: 0.0548 - val_mse: 0.0548 - val_mae: 0.1837
Epoch 17/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0431 - mse: 0.0431 - mae: 0.1603 - val_loss: 0.0412 - val_mse: 0.0412 - val_mae: 0.1610
Epoch 18/40
3/3 [==============================] - 0s 57ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1512 - val_loss: 0.0473 - val_mse: 0.0473 - val_mae: 0.1615
Epoch 19/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0426 - mse: 0.0426 - mae: 0.1599 - val_loss: 0.0502 - val_mse: 0.0502 - val_mae: 0.1643
Epoch 20/40
3/3 [==============================] - 0s 61ms/step - loss: 0.0579 - mse: 0.0579 - mae: 0.1759 - val_loss: 0.0423 - val_mse: 0.0423 - val_mae: 0.1570
Epoch 21/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0507 - mse: 0.0507 - mae: 0.1801 - val_loss: 0.0473 - val_mse: 0.0473 - val_mae: 0.1655
Epoch 22/40
3/3 [==============================] - 0s 71ms/step - loss: 0.0399 - mse: 0.0399 - mae: 0.1639 - val_loss: 0.0588 - val_mse: 0.0588 - val_mae: 0.1883
Epoch 23/40
3/3 [==============================] - 0s 36ms/step - loss: 0.0468 - mse: 0.0468 - mae: 0.1620 - val_loss: 0.0428 - val_mse: 0.0428 - val_mae: 0.1614
Epoch 24/40
3/3 [==============================] - 0s 35ms/step - loss: 0.0417 - mse: 0.0417 - mae: 0.1593 - val_loss: 0.0419 - val_mse: 0.0419 - val_mae: 0.1566
Epoch 25/40
3/3 [==============================] - 0s 40ms/step - loss: 0.0481 - mse: 0.0481 - mae: 0.1619 - val_loss: 0.0418 - val_mse: 0.0418 - val_mae: 0.1554
Epoch 26/40
3/3 [==============================] - 0s 40ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1476 - val_loss: 0.0420 - val_mse: 0.0420 - val_mae: 0.1560
Epoch 27/40
3/3 [==============================] - 0s 38ms/step - loss: 0.0471 - mse: 0.0471 - mae: 0.1628 - val_loss: 0.0429 - val_mse: 0.0429 - val_mae: 0.1625
Epoch 28/40
3/3 [==============================] - 0s 35ms/step - loss: 0.0461 - mse: 0.0461 - mae: 0.1648 - val_loss: 0.0434 - val_mse: 0.0434 - val_mae: 0.1660
Epoch 29/40
3/3 [==============================] - 0s 36ms/step - loss: 0.0360 - mse: 0.0360 - mae: 0.1533 - val_loss: 0.0427 - val_mse: 0.0427 - val_mae: 0.1616
Epoch 30/40
3/3 [==============================] - 0s 36ms/step - loss: 0.0376 - mse: 0.0376 - mae: 0.1511 - val_loss: 0.0431 - val_mse: 0.0431 - val_mae: 0.1578
Epoch 31/40
3/3 [==============================] - 0s 37ms/step - loss: 0.0435 - mse: 0.0435 - mae: 0.1703 - val_loss: 0.0448 - val_mse: 0.0448 - val_mae: 0.1599
Epoch 32/40
3/3 [==============================] - 0s 39ms/step - loss: 0.0414 - mse: 0.0414 - mae: 0.1560 - val_loss: 0.0459 - val_mse: 0.0459 - val_mae: 0.1602
Epoch 33/40
3/3 [==============================] - 0s 44ms/step - loss: 0.0437 - mse: 0.0437 - mae: 0.1640 - val_loss: 0.0477 - val_mse: 0.0477 - val_mae: 0.1672
Epoch 34/40
3/3 [==============================] - 0s 43ms/step - loss: 0.0421 - mse: 0.0421 - mae: 0.1515 - val_loss: 0.0508 - val_mse: 0.0508 - val_mae: 0.1743
Epoch 35/40
3/3 [==============================] - 0s 45ms/step - loss: 0.0427 - mse: 0.0427 - mae: 0.1623 - val_loss: 0.0461 - val_mse: 0.0461 - val_mae: 0.1658
Epoch 36/40
3/3 [==============================] - 0s 34ms/step - loss: 0.0444 - mse: 0.0444 - mae: 0.1668 - val_loss: 0.0426 - val_mse: 0.0426 - val_mae: 0.1588
Epoch 37/40
3/3 [==============================] - 0s 38ms/step - loss: 0.0468 - mse: 0.0468 - mae: 0.1661 - val_loss: 0.0496 - val_mse: 0.0496 - val_mae: 0.1661
Epoch 38/40
3/3 [==============================] - 0s 38ms/step - loss: 0.0431 - mse: 0.0431 - mae: 0.1544 - val_loss: 0.0435 - val_mse: 0.0435 - val_mae: 0.1588
Epoch 39/40
3/3 [==============================] - 0s 36ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1587 - val_loss: 0.0440 - val_mse: 0.0440 - val_mae: 0.1577
Epoch 40/40
3/3 [==============================] - 0s 34ms/step - loss: 0.0546 - mse: 0.0546 - mae: 0.1726 - val_loss: 0.0544 - val_mse: 0.0544 - val_mae: 0.1774
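
Validation loss flattens out after roughly epoch 17, so the later epochs mostly add noise. An EarlyStopping callback would halt training once val_loss stops improving; a sketch (re-running fit like this continues from the current weights, so the model would need to be rebuilt for a fresh run):

#Sketch: stop when val_loss stops improving and keep the best weights
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                              restore_best_weights=True)
history = model.fit(train_X, train_Y, validation_data=(test_X, test_Y),
                    epochs=40, batch_size=40, verbose=1, shuffle=True,
                    callbacks=[early_stop])
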
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
plt.plot(history.history['mse'], label='train mse')
plt.plot(history.history['val_mse'], label='val mse')
plt.xlabel("epoch")
plt.ylabel("MSE")
plt.legend()