Predicting Stock Prices of Home Depot Based on Trends and the Sentiment of News and Tweets

Betul Mescioglu
import pandas as pd
import seaborn
import matplotlib.pyplot as plt
import datetime as dt
from datetime import date
from dateutil.relativedelta import relativedelta
!pip install yahoo_fin

Stock Data of Home Depot:

from yahoo_fin import stock_info as si

def stock_data(stock_name):
    #Pull the last three years of daily OHLCV data for the given ticker
    df = si.get_data(stock_name, start_date=(date.today() - relativedelta(years=3)).strftime('%m/%d/%Y'), 
                     end_date=date.today().strftime('%m/%d/%Y'))
    return df
ticker='HD'
company_name='Home Depot'
df = stock_data(ticker)
df.head()
open high low close adjclose volume ticker
2020-06-08 252.490005 256.809998 252.259995 256.769989 239.264954 3811900 HD
2020-06-09 255.330002 258.290009 253.860001 256.760010 239.255646 3716100 HD
2020-06-10 257.450012 259.290009 254.220001 254.449997 237.103104 3589800 HD
2020-06-11 248.860001 250.619995 238.740005 239.470001 223.144394 6563700 HD
2020-06-12 243.070007 246.389999 237.050003 242.449997 225.921204 5238500 HD
df.tail()
open high low close adjclose volume ticker
2023-05-31 289.589996 290.000000 281.959991 283.450012 283.450012 18288800 HD
2023-06-01 284.049988 289.220001 279.980011 288.390015 288.390015 4305100 HD
2023-06-02 290.649994 296.209991 289.720001 295.940002 295.940002 4514600 HD
2023-06-05 295.619995 295.720001 291.369995 293.100006 293.100006 3028800 HD
2023-06-06 291.820007 296.920013 291.649994 296.000000 296.000000 2854800 HD
#Add target columns: the 10-day and 20-day rolling averages of the closing price.
#The dataset has values for weekdays only, so two weeks spans 10 data points
#and one month spans 20 data points.
df['TwoWeeks'] = df['close'].rolling(10).mean()
df['Month'] = df['close'].rolling(20).mean()
df.head()
open high low close adjclose volume ticker TwoWeeks Month
2020-06-08 252.490005 256.809998 252.259995 256.769989 239.264938 3811900 HD NaN NaN
2020-06-09 255.330002 258.290009 253.860001 256.760010 239.255646 3716100 HD NaN NaN
2020-06-10 257.450012 259.290009 254.220001 254.449997 237.103119 3589800 HD NaN NaN
2020-06-11 248.860001 250.619995 238.740005 239.470001 223.144379 6563700 HD NaN NaN
2020-06-12 243.070007 246.389999 237.050003 242.449997 225.921188 5238500 HD NaN NaN
df.dropna(inplace=True)
df.head()
open high low close adjclose volume ticker TwoWeeks Month
2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800 HD 247.481999 248.1370
2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800 HD 247.300999 247.6660
2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000 HD 247.187000 247.2865
2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700 HD 247.370000 246.9620
2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300 HD 247.842999 247.4940
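
Note that rolling(10).mean() is a trailing average: each row holds the mean of the current and previous nine closing prices. If the goal were instead to predict the average close over the next two weeks, the window would have to be shifted forward. A minimal sketch of that variant (df_alt is a hypothetical copy; it is not used below):

#Hypothetical forward-looking targets (sketch only):
#shift(-10) aligns each row with the mean of the NEXT 10 trading days
df_alt = df.copy()
df_alt['TwoWeeksAhead'] = df_alt['close'].rolling(10).mean().shift(-10)
df_alt['MonthAhead'] = df_alt['close'].rolling(20).mean().shift(-20)
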
!pip install pytrends
import pytrends
from pytrends.request import TrendReq
pytrends = TrendReq()
kw_list=['Home Depot']
#Gather Google Trends interest for 'Home Depot' over the last three years
pytrends.build_payload(kw_list, geo='',
                       timeframe='{} {}'.format((date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'),
                                                date.today().strftime('%Y-%m-%d')))
keyword_interest = pytrends.interest_over_time()
del keyword_interest['isPartial']
keyword_interest.head()
Home Depot
date
2020-06-07 100
2020-06-14 100
2020-06-21 96
2020-06-28 97
2020-07-05 90
keyword_interest['Home Depot'].plot(title='Google Trends for \'Home Depot\' in the last three years')
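
Google Trends could also track the ticker 'HD' alongside 'Home Depot': pytrends accepts up to five terms in a single payload, which puts all of them on the same relative 0-100 scale. A sketch of that comparison (not executed here; the rest of the notebook uses only 'Home Depot'):

#Sketch: compare both keywords on Google Trends' shared 0-100 scale
pytrends.build_payload(['HD', 'Home Depot'], geo='',
                       timeframe='{} {}'.format((date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'),
                                                date.today().strftime('%Y-%m-%d')))
both = pytrends.interest_over_time().drop(columns='isPartial')
both.plot(title="Google Trends: 'HD' vs 'Home Depot'")
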
df_combined = pd.concat([df, keyword_interest], axis=1)
#df_combined.dropna(subset='Month', inplace=True)
df_combined.head(20)
open high low close adjclose volume ticker TwoWeeks Month Home Depot
2020-06-07 NaN NaN NaN NaN NaN NaN NaN NaN NaN 100.0
2020-06-14 NaN NaN NaN NaN NaN NaN NaN NaN NaN 100.0
2020-06-21 NaN NaN NaN NaN NaN NaN NaN NaN NaN 96.0
2020-06-28 NaN NaN NaN NaN NaN NaN NaN NaN NaN 97.0
2020-07-05 NaN NaN NaN NaN NaN NaN NaN NaN NaN 90.0
2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800.0 HD 247.481999 248.137000 NaN
2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800.0 HD 247.300999 247.666000 NaN
2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000.0 HD 247.187000 247.286500 NaN
2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700.0 HD 247.370000 246.962000 NaN
2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300.0 HD 247.842999 247.494000 NaN
2020-07-12 NaN NaN NaN NaN NaN NaN NaN NaN NaN 86.0
2020-07-13 251.919998 257.859985 249.089996 249.619995 232.602386 4437500.0 HD 248.703999 247.852500 NaN
2020-07-14 249.000000 258.179993 248.460007 257.790009 240.215408 4614200.0 HD 249.871001 248.674001 NaN
2020-07-15 260.140015 261.290009 255.149994 257.799988 240.224716 4343700.0 HD 250.600000 249.066500 NaN
2020-07-16 256.760010 260.500000 256.000000 258.079987 240.485641 2511200.0 HD 251.592999 249.427999 NaN
2020-07-17 260.029999 260.649994 257.720001 260.380005 242.628845 3091300.0 HD 252.781000 249.986499 NaN
2020-07-19 NaN NaN NaN NaN NaN NaN NaN NaN NaN 85.0
2020-07-20 259.040009 261.200012 258.019989 260.170013 242.433228 2451100.0 HD 253.843001 250.662500 NaN
2020-07-21 261.619995 263.869995 260.720001 262.420013 244.529800 2439900.0 HD 255.350002 251.325500 NaN
2020-07-22 262.589996 265.589996 262.000000 265.170013 247.092316 2750000.0 HD 256.950003 252.068501 NaN

When data is pulled over a multi-year range, Google Trends returns one value per week, dated on Sundays. Since we are concerned with trends rather than daily values, I will forward-fill each Sunday value into the trading days of the week that follows.

df_combined['Home Depot'] = df_combined['Home Depot'].ffill()
df_combined.dropna(inplace=True)
df_combined.head(30)
open high low close adjclose volume ticker TwoWeeks Month Home Depot
2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800.0 HD 247.481999 248.137000 90.0
2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800.0 HD 247.300999 247.666000 90.0
2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000.0 HD 247.187000 247.286500 90.0
2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700.0 HD 247.370000 246.962000 90.0
2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300.0 HD 247.842999 247.494000 90.0
2020-07-13 251.919998 257.859985 249.089996 249.619995 232.602386 4437500.0 HD 248.703999 247.852500 86.0
2020-07-14 249.000000 258.179993 248.460007 257.790009 240.215408 4614200.0 HD 249.871001 248.674001 86.0
2020-07-15 260.140015 261.290009 255.149994 257.799988 240.224716 4343700.0 HD 250.600000 249.066500 86.0
2020-07-16 256.760010 260.500000 256.000000 258.079987 240.485641 2511200.0 HD 251.592999 249.427999 86.0
2020-07-17 260.029999 260.649994 257.720001 260.380005 242.628845 3091300.0 HD 252.781000 249.986499 86.0
2020-07-20 259.040009 261.200012 258.019989 260.170013 242.433228 2451100.0 HD 253.843001 250.662500 85.0
2020-07-21 261.619995 263.869995 260.720001 262.420013 244.529800 2439900.0 HD 255.350002 251.325500 85.0
2020-07-22 262.589996 265.589996 262.000000 265.170013 247.092316 2750000.0 HD 256.950003 252.068501 85.0
2020-07-23 267.799988 267.799988 261.799988 263.809998 245.824997 2680100.0 HD 258.535002 252.952501 85.0
2020-07-24 265.040009 266.890015 262.989990 265.309998 247.222717 2984500.0 HD 260.055002 253.949001 85.0
2020-07-27 265.089996 268.679993 265.089996 267.420013 249.188919 2412500.0 HD 261.835004 255.269501 85.0
2020-07-28 268.559998 269.070007 264.670013 265.279999 247.194794 2227000.0 HD 262.584003 256.227502 85.0
2020-07-29 264.799988 267.109985 264.170013 264.660004 246.617065 2874100.0 HD 263.270004 256.935002 85.0
2020-07-30 263.339996 267.350006 261.549988 266.309998 248.154556 2347800.0 HD 264.093005 257.843002 85.0
2020-07-31 265.000000 267.170013 260.609985 265.489990 247.390457 3640600.0 HD 264.604004 258.692502 85.0
2020-08-03 266.730011 268.579987 265.670013 266.179993 248.033463 2363500.0 HD 265.205002 259.524001 80.0
2020-08-04 266.630005 267.890015 263.839996 267.869995 249.608231 2224000.0 HD 265.750000 260.550001 80.0
2020-08-05 268.390015 268.390015 265.890015 267.480011 249.244797 1959900.0 HD 265.981000 261.465501 80.0
2020-08-06 266.600006 270.440002 266.529999 269.369995 251.005936 2203400.0 HD 266.537000 262.536001 80.0
2020-08-07 270.609985 274.920013 269.809998 271.640015 253.121140 2846300.0 HD 267.170001 263.612502 80.0
2020-08-10 272.420013 275.000000 271.799988 274.730011 256.000580 2393100.0 HD 267.901001 264.868002 79.0
2020-08-11 277.690002 279.369995 274.410004 274.920013 256.177612 3321300.0 HD 268.865002 265.724503 79.0
2020-08-12 279.750000 282.970001 276.959991 281.579987 262.383514 3867900.0 HD 270.557001 266.913503 79.0
2020-08-13 281.160004 282.649994 279.739990 281.660004 262.458069 2202400.0 HD 272.092001 268.092503 79.0
2020-08-14 281.140015 282.000000 279.190002 280.549988 261.423767 2490400.0 HD 273.598001 269.101003 79.0

Home Depot on Twitter:

!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
kw_list = ['HD', 'Home Depot']
kw_list
['HD', 'Home Depot']

# The following code collects tweets mentioning 'HD' or 'Home Depot' from the
# "markets" and "MarketWatch" Twitter accounts, published in the last three years

import snscrape.modules.twitter as sntwitter
import pandas as pd
import datetime as dt
from datetime import date

# Creating list to append tweet data to
attributes_container = []

# Candidate news accounts; only the two in `sources` below are scraped here
# (see the sketch at the end of this section for looping over all of them)
sources_all =['CNBC','cnn', 'cnnbrk','MarketWatch', 'Benzinga', 'Stocktwits','BreakoutStocks', 
          'bespokeinvest','WSJMarkets','Stephanie_Link','nytimesbusiness','IBDinvestors','WSJDealJournal',
         'business', 'TheEconomist','WSJ', 'ABC', 'CBSNews','FoxNews', 'NBCNews']

sources=['markets','MarketWatch']
query = 'from:{} since:{} until:{}'.format(sources[0],
                                           (date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'),
                                           date.today().strftime('%Y-%m-%d'))
for tweet in sntwitter.TwitterSearchScraper(query).get_items():
    if any(kw.lower() in tweet.content.lower() for kw in kw_list):
        attributes_container.append([tweet.date, tweet.content.split('http')[0]])
tweets_df = pd.DataFrame(attributes_container, columns=["Date Created", "Tweets"])
tweets_df.drop_duplicates(subset='Tweets',inplace=True)

attributes_container = []
query = 'from:{} since:{} until:{}'.format(sources[1],
                                           (date.today() - relativedelta(years=3)).strftime('%Y-%m-%d'),
                                           date.today().strftime('%Y-%m-%d'))
for tweet in sntwitter.TwitterSearchScraper(query).get_items():
    if any(kw.lower() in tweet.content.lower() for kw in kw_list):
        attributes_container.append([tweet.date, tweet.user.username, tweet.content.split('http')[0]])
# Creating a dataframe from the tweets list above; the user name column
# is dropped later, before the two frames are combined
tweets_df_marketwatch = pd.DataFrame(attributes_container, columns=["Date Created", "User Name", "Tweets"])
print(tweets_df_marketwatch.shape)
tweets_df_marketwatch.drop_duplicates(subset='Tweets',inplace=True)
pd.set_option("display.max_colwidth", None)
tweets_df.head()
Date Created Tweets
0 2023-03-01 18:05:43+00:00 From Meta to Home Depot, corporate America is talking about AI on earnings calls
1 2023-02-21 19:06:41+00:00 Home Depot, Walmart and DocuSign.\n\n@RitikaGuptaTV has your stocks to watch this Tuesday
2 2023-02-21 15:15:07+00:00 Home Depot forecasts a fiscal-year profit decline and announces plans for a $1 billion wage investment for hourly workers
3 2022-11-15 18:11:03+00:00 Home Depot reported profit that beat expectations, with the CEO saying consumers are staying resilient. But a drop in transactions volume has investors concerned
4 2022-08-16 15:09:26+00:00 Home Depot’s second-quarter results beat Wall Street estimates even as the US housing market shows signs of cooling off
tweets_df_marketwatch.shape
(1159, 3)
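
The two scrapes above differ only in the account name, so the same logic can be wrapped in a helper and looped over the full sources_all list. A sketch, assuming every account is scraped the same way (scrape_mentions is a hypothetical helper, not part of the original run):

def scrape_mentions(account, phrases=('HD', 'Home Depot')):
    #Collect tweets from one account over the last three years that mention any phrase
    since = (date.today() - relativedelta(years=3)).strftime('%Y-%m-%d')
    until = date.today().strftime('%Y-%m-%d')
    query = 'from:{} since:{} until:{}'.format(account, since, until)
    rows = []
    for tweet in sntwitter.TwitterSearchScraper(query).get_items():
        if any(p.lower() in tweet.content.lower() for p in phrases):
            rows.append([tweet.date, tweet.content.split('http')[0]])
    return pd.DataFrame(rows, columns=['Date Created', 'Tweets'])

#all_tweets = pd.concat([scrape_mentions(s) for s in sources_all]).drop_duplicates(subset='Tweets')
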

Home Depot in Google News:

!pip install pygooglenews 
kw_list = ['Home Depot']
kw_list
['Home Depot']

import pygooglenews
from pygooglenews import GoogleNews
gn = GoogleNews()
headlines_related = []

date_list = pd.date_range(end=date.today().strftime('%Y-%m-%d'), start=(date.today()-relativedelta(years=3)).strftime('%Y-%m-%d')).tolist()
clean_date = [str(i).split(" ")[0] for i in date_list]
#Loop variable named `day` so it does not shadow the imported `date` class
for day in clean_date:
    headlines = []
    for word in kw_list:
        search = gn.search(word, when=day)
        for item in search['entries']:
            headlines.append(item['title'])
    #The search returns news whose body may contain one of the keywords.
    #We will use only headlines to make predictions, but some headlines do not
    #contain any of the keywords, so only include headlines that have one of them.
    for headline in headlines:
        for i in range(len(kw_list)):
            if kw_list[i] in headline:
                if (day, headline) not in headlines_related:
                    headlines_related.append((day, headline))
len(headlines_related)
1739
headlines_related[:10]
[('2020-06-08',
  "Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... - USA TODAY"),
 ('2020-06-17',
  "Here's the Defining Characteristic of Home Depot's Success - Nasdaq"),
 ('2020-06-17',
  'Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more - 9to5Toys'),
 ('2020-06-18',
  'Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more - 9to5Toys'),
 ('2020-06-24',
  '1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa - OCRegister'),
 ('2020-06-25',
  "'It's dehumanizing': Home Depot employee felt colleague's racist ... - Hamilton Spectator"),
 ('2020-06-30',
  'Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna-Glo ... - USA TODAY'),
 ('2020-06-30',
  'The Home Depot Announces Renewable Energy Goal and Pledges ... - PR Newswire'),
 ('2020-07-01',
  'Home Depot bans some rope sales after nooses were found tied on ... - Courier Journal'),
 ('2020-07-02',
  'Home Depot changes rope sales practice after nooses are found in store - CNN')]

df_news = pd.DataFrame(headlines_related, columns=['Date', 'Headline'])
df_news['Headline'] = df_news['Headline'].apply(lambda x: x.split('-')[0]) #strip the trailing news source; splitting on every hyphen also truncates hyphenated names (see the sketch at the end of this section)
df_news['Date'] = pd.to_datetime(df_news['Date'])
print(df_news.shape)
df_news.drop_duplicates(subset='Headline', inplace=True)
print(df_news.shape)
df_news.head()
(1739, 2)
(1373, 2)
Date Headline
0 2020-06-08 Home Depot Father's Day sales: Save on DeWalt and Milwaukee ...
1 2020-06-17 Here's the Defining Characteristic of Home Depot's Success
2 2020-06-17 Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more
3 2020-06-18 Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more
4 2020-06-24 1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa
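
The lambda above splits on every hyphen, so the source suffix is stripped but hyphenated names are also truncated (for example, 'Dyna-Glo' is cut to 'Dyna' in the sentiment tables further below). A safer variant, sketched here, would split only on the last spaced ' - ' separator:

#Sketch: strip only the trailing ' - Source' suffix, keeping hyphenated names intact
h = 'Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna-Glo ... - USA TODAY'
print(h.rsplit(' - ', 1)[0])   #keeps 'Dyna-Glo', drops only ' - USA TODAY'
print(h.split('-')[0])         #truncates at the first hyphen, inside 'Dyna-Glo'
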

Sentiment Analysis of All Text Data:

Tweets:

!pip install vaderSentiment
#Drop the user name column so both tweet frames have the same columns,
#then combine the tweets coming from markets and MarketWatch
tweets_df_marketwatch.drop("User Name", axis=1, inplace=True)
markets = pd.concat([tweets_df, tweets_df_marketwatch])
markets.shape
(1169, 2)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer() 
def vader(row):
    #Return VADER's normalized compound sentiment score, in [-1, 1]
    comp = sid.polarity_scores(row)['compound']
    return comp
import re
#Decode the HTML-escaped ampersand and replace newline/non-breaking-space characters
markets['Tweets'] = markets['Tweets'].apply(lambda x: re.sub("&amp;","&", x))
markets['Tweets'] = markets['Tweets'].apply(lambda x: re.sub("\n"," ", x))
markets['Tweets'] = markets['Tweets'].apply(lambda x: re.sub("\xa0"," ", x))
markets.head()
Date Created Tweets
0 2023-03-01 18:05:43+00:00 From Meta to Home Depot, corporate America is talking about AI on earnings calls
1 2023-02-21 19:06:41+00:00 Home Depot, Walmart and DocuSign. @RitikaGuptaTV has your stocks to watch this Tuesday
2 2023-02-21 15:15:07+00:00 Home Depot forecasts a fiscal-year profit decline and announces plans for a $1 billion wage investment for hourly workers
3 2022-11-15 18:11:03+00:00 Home Depot reported profit that beat expectations, with the CEO saying consumers are staying resilient. But a drop in transactions volume has investors concerned
4 2022-08-16 15:09:26+00:00 Home Depot’s second-quarter results beat Wall Street estimates even as the US housing market shows signs of cooling off
#Keep only the 'YYYY-MM-DD' date part of each timestamp
markets['Date Created'] = markets['Date Created'].astype('str')
markets['Date Created'] = markets['Date Created'].apply(lambda x: x[:10])
markets.head()
Date Created Tweets
0 2023-03-01 From Meta to Home Depot, corporate America is talking about AI on earnings calls
1 2023-02-21 Home Depot, Walmart and DocuSign. @RitikaGuptaTV has your stocks to watch this Tuesday
2 2023-02-21 Home Depot forecasts a fiscal-year profit decline and announces plans for a $1 billion wage investment for hourly workers
3 2022-11-15 Home Depot reported profit that beat expectations, with the CEO saying consumers are staying resilient. But a drop in transactions volume has investors concerned
4 2022-08-16 Home Depot’s second-quarter results beat Wall Street estimates even as the US housing market shows signs of cooling off
markets['Date Created'] = pd.to_datetime(markets['Date Created'])
markets.sort_values(by='Date Created', inplace=True)
markets.reset_index(drop=True, inplace=True)
markets.head()
Date Created Tweets
0 2020-06-06 What to make of the recent jobs numbers? Joseph Stiglitz says there's more than meets the eye, including those who aren't considered "unemployed," but aren't working. And without intervention, things may get worse, he says. WATCH:
1 2020-06-08 Bull, bear, bull, bear and now a new bull market — whatever’s next, these stocks will outperform, strategist says
2 2020-06-09 HD Supply misses on profit expectations but beats on sales
3 2020-06-11 Normally, Madrid swells with tourists. Our editor @bkollmeyer writes about how locals are reclaiming their city in a summer without travel.
4 2020-06-11 Should I tell my sister that her husband, a notorious spender, has a secret credit card?
markets['VaderSent'] = markets['Tweets'].apply(vader)
markets
Date Created Tweets VaderSent
0 2020-06-06 What to make of the recent jobs numbers? Joseph Stiglitz says there's more than meets the eye, including those who aren't considered "unemployed," but aren't working. And without intervention, things may get worse, he says. WATCH: -0.6310
1 2020-06-08 Bull, bear, bull, bear and now a new bull market — whatever’s next, these stocks will outperform, strategist says 0.0000
2 2020-06-09 HD Supply misses on profit expectations but beats on sales 0.1280
3 2020-06-11 Normally, Madrid swells with tourists. Our editor @bkollmeyer writes about how locals are reclaiming their city in a summer without travel. 0.0000
4 2020-06-11 Should I tell my sister that her husband, a notorious spender, has a secret credit card? -0.0772
... ... ... ...
1164 2023-05-30 Trans designer in Target anti-LGBTQ+ backlash says he was ‘dealt the worst hand’ -0.6249
1165 2023-05-31 Brown-Forman to invest $200 million to expand Tequila distillery in Jalisco, Mexico 0.3182
1166 2023-06-02 Zelle and Chase working to resolve duplicate-payments issue 0.3818
1167 2023-06-03 The ‘best job in America’ pays over $120,000 a year, offers low stress, healthy work-life balance — and its workers are in high demand -0.4019
1168 2023-06-05 Cava sets IPO terms, as restaurant chain is set to be valued at more than $2 billion 0.4404

1169 rows × 3 columns

#Vader gives a sentiment value between -1 and 1, -1 being the most negative, 
#1 being the most positive and 0 being neutral. We will put each text in the
#corresponding pos, neu or neg bin based on this value.
def bins(value):
    if value < -0.33:
        return 'neg'
    if value <= 0.33:
        return 'neu'
    return 'pos'
markets['sentiment'] = markets['VaderSent'].apply(bins)
markets.head(10)
Date Created Tweets VaderSent sentiment
0 2020-06-06 What to make of the recent jobs numbers? Joseph Stiglitz says there's more than meets the eye, including those who aren't considered "unemployed," but aren't working. And without intervention, things may get worse, he says. WATCH: -0.6310 neg
1 2020-06-08 Bull, bear, bull, bear and now a new bull market — whatever’s next, these stocks will outperform, strategist says 0.0000 neu
2 2020-06-09 HD Supply misses on profit expectations but beats on sales 0.1280 neu
3 2020-06-11 Normally, Madrid swells with tourists. Our editor @bkollmeyer writes about how locals are reclaiming their city in a summer without travel. 0.0000 neu
4 2020-06-11 Should I tell my sister that her husband, a notorious spender, has a secret credit card? -0.0772 neu
5 2020-06-12 Palantir Technologies Inc. is reportedly preparing to confidentially file for its long-awaited IPO. 0.0000 neu
6 2020-06-15 It's a well-known secret of Wall Street: Little business actually takes place in New York. We spoke to several Wall Street road warriors to learn what the past 3 months have been like when they can't fly anywhere. 0.3612 pos
7 2020-06-16 Dear airline passengers: Wear your face mask or you might get banned from flying. -0.1027 neu
8 2020-06-18 Facebook takes down Trump-Pence ads featuring symbols previously used by Nazis 0.0000 neu
9 2020-06-18 Dow opens with 170 point drop as jobless claims stay elevated -0.2732 neu
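
For reference, polarity_scores() returns the proportions of negative, neutral and positive tokens alongside the compound score used above. For example, on the HD Supply tweet scored 0.128 in the table:

#The compound value binned above is one of four scores VADER returns
sid.polarity_scores('HD Supply misses on profit expectations but beats on sales')
#-> a dict with 'neg', 'neu' and 'pos' proportions plus 'compound' (here 0.128)
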
#Each week, determine the share of positive, neutral and negative tweets.
#Iterating the weekly groupby yields (week, Series) pairs, which become the
#'Date' and 'Data' columns below.
tweets_weight = pd.DataFrame(markets.groupby([pd.Grouper(key='Date Created', freq='W')])['sentiment'], columns=['Date','Data'])
for i in range(len(tweets_weight)):
    n = dict(tweets_weight.loc[i,'Data'].value_counts(normalize=True))
    if 'neg' not in n:
        n['neg']=0
    if 'pos' not in n:
        n['pos']=0
    if 'neu' not in n:
        n['neu']=0
    tweets_weight.loc[i,'tweet_pos'] = n['pos']
    tweets_weight.loc[i,'tweet_neg'] = n['neg']
    tweets_weight.loc[i,'tweet_neu'] = n['neu']
tweets_weight
Date Data tweet_pos tweet_neg tweet_neu
0 2020-06-07 0 neg Name: sentiment, dtype: object 0.000000 1.000000 0.000000
1 2020-06-14 1 neu 2 neu 3 neu 4 neu 5 neu Name: sentiment, dtype: object 0.000000 0.000000 1.000000
2 2020-06-21 6 pos 7 neu 8 neu 9 neu 10 pos 11 neu Name: sentiment, dtype: object 0.333333 0.000000 0.666667
3 2020-06-28 12 neu 13 neu 14 neu 15 neu 16 neg 17 neu 18 neu 19 neg 20 neu Name: sentiment, dtype: object 0.000000 0.222222 0.777778
4 2020-07-05 21 neg 22 neg 23 neg 24 neg 25 pos 26 neu 27 neg 28 neu 29 neu 30 neu Name: sentiment, dtype: object 0.100000 0.500000 0.400000
... ... ... ... ... ...
153 2023-05-14 1141 pos 1142 neu 1143 neu 1144 neu 1145 neg Name: sentiment, dtype: object 0.200000 0.200000 0.600000
154 2023-05-21 1146 neu 1147 neu 1148 neu 1149 neu 1150 neu 1151 neu 1152 neg 1153 neu 1154 neg 1155 neu 1156 pos 1157 neu 1158 neu 1159 neu 1160 neu Name: sentiment, dtype: object 0.066667 0.133333 0.800000
155 2023-05-28 1161 neg 1162 neg 1163 neg Name: sentiment, dtype: object 0.000000 1.000000 0.000000
156 2023-06-04 1164 neg 1165 neu 1166 pos 1167 neg Name: sentiment, dtype: object 0.250000 0.500000 0.250000
157 2023-06-11 1168 pos Name: sentiment, dtype: object 1.000000 0.000000 0.000000

158 rows × 5 columns

tweets_weight.drop('Data', inplace=True, axis=1)
tweets_weight.head()
Date tweet_pos tweet_neg tweet_neu
0 2020-06-07 0.000000 1.000000 0.000000
1 2020-06-14 0.000000 0.000000 1.000000
2 2020-06-21 0.333333 0.000000 0.666667
3 2020-06-28 0.000000 0.222222 0.777778
4 2020-07-05 0.100000 0.500000 0.400000
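
The groupby construction above works, but the same weekly shares can be computed more directly with value_counts(normalize=True) and unstack(). An equivalent sketch (weeks with no tweets are dropped here rather than filled with zeros):

#Equivalent, more idiomatic computation of the weekly sentiment shares
weekly = (markets.groupby(pd.Grouper(key='Date Created', freq='W'))['sentiment']
                 .value_counts(normalize=True)
                 .unstack(fill_value=0)
                 .reindex(columns=['pos', 'neg', 'neu'], fill_value=0)
                 .add_prefix('tweet_')
                 .reset_index()
                 .rename(columns={'Date Created': 'Date'}))
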
tweets_weight.plot(x='Date', y=['tweet_pos','tweet_neu','tweet_neg'],title='Tweets\' Sentiment')

Tweets were mostly neutral in tone. Since the three weekly shares sum to one, the negative correlation between the positive and negative shares is unsurprising.

Google News:

df_news['VaderSent'] = df_news['Headline'].apply(vader)
df_news.head(10)
Date Headline VaderSent
0 2020-06-08 Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... 0.4939
1 2020-06-17 Here's the Defining Characteristic of Home Depot's Success 0.5719
2 2020-06-17 Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more 0.0000
3 2020-06-18 Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more 0.2500
4 2020-06-24 1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa -0.8860
5 2020-06-25 'It's dehumanizing': Home Depot employee felt colleague's racist ... -0.8126
6 2020-06-30 Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna 0.2023
7 2020-06-30 The Home Depot Announces Renewable Energy Goal and Pledges ... 0.2732
8 2020-07-01 Home Depot bans some rope sales after nooses were found tied on ... 0.0000
9 2020-07-02 Home Depot changes rope sales practice after nooses are found in store 0.0000
df_news['Date'] = pd.to_datetime(df_news['Date'])
df_news['sentiment'] = df_news['VaderSent'].apply(bins)
df_news.head(10)
Date Headline VaderSent sentiment
0 2020-06-08 Home Depot Father's Day sales: Save on DeWalt and Milwaukee ... 0.4939 pos
1 2020-06-17 Here's the Defining Characteristic of Home Depot's Success 0.5719 pos
2 2020-06-17 Home Depot’s new outdoor tool sale takes up to 40% off RYOBI, DEWALT, more 0.0000 neu
3 2020-06-18 Home Depot’s Last Chance Father’s Day sale takes up to 35% off tools and more 0.2500 neu
4 2020-06-24 1 dead, 1 hurt in Home Depot parking lot crash in Costa Mesa -0.8860 neg
5 2020-06-25 'It's dehumanizing': Home Depot employee felt colleague's racist ... -0.8126 neg
6 2020-06-30 Home Depot 4th of July sale: Shop top deals on DeWalt, Dyna 0.2023 neu
7 2020-06-30 The Home Depot Announces Renewable Energy Goal and Pledges ... 0.2732 neu
8 2020-07-01 Home Depot bans some rope sales after nooses were found tied on ... 0.0000 neu
9 2020-07-02 Home Depot changes rope sales practice after nooses are found in store 0.0000 neu
#Repeat the weekly share computation for the news headlines
news_weight = pd.DataFrame(df_news.groupby([pd.Grouper(key='Date', freq='W')])['sentiment'], columns=['Date','Data'])
for i in range(len(news_weight)):
    n = dict(news_weight.loc[i,'Data'].value_counts(normalize=True))
    if 'neg' not in n:
        n['neg']=0
    if 'pos' not in n:
        n['pos']=0
    if 'neu' not in n:
        n['neu']=0
    news_weight.loc[i,'news_pos'] = n['pos']
    news_weight.loc[i,'news_neg'] = n['neg']
    news_weight.loc[i,'news_neu'] = n['neu']
news_weight
Date Data news_pos news_neg news_neu
0 2020-06-14 0 pos Name: sentiment, dtype: object 1.000000 0.000000 0.000000
1 2020-06-21 1 pos 2 neu 3 neu Name: sentiment, dtype: object 0.333333 0.000000 0.666667
2 2020-06-28 4 neg 5 neg Name: sentiment, dtype: object 0.000000 1.000000 0.000000
3 2020-07-05 6 neu 7 neu 8 neu 9 neu 10 neu 11 neg Name: sentiment, dtype: object 0.000000 0.166667 0.833333
4 2020-07-12 12 neu 13 neu Name: sentiment, dtype: object 0.000000 0.000000 1.000000
... ... ... ... ... ...
152 2023-05-14 1546 pos 1547 neu 1548 neu 1549 pos 1550 pos 1551 neu 1552 neu 1562 neu 1570 neu 1571 neg 1577 neu 1578 neu 1579 neu 1581 neu 1584 neg 1586 neu Name: sentiment, dtype: object 0.187500 0.125000 0.687500
153 2023-05-21 1587 neg 1589 neu 1590 neu 1592 neu 1593 neg 1594 neg 1595 neu 1596 neg 1597 neu 1598 neu 1599 neu 1600 neu 1601 pos 1602 neu 1603 neu 1604 neu 1605 neu 1606 neu 1608 neu 1609 neg 1610 neu 1611 neg 1614 neg 1615 pos 1616 neu 1619 neu 1620 neu 1621 neu 1623 pos 1631 neu 1632 neu 1633 neu Name: sentiment, dtype: object 0.093750 0.218750 0.687500
154 2023-05-28 1634 neu 1638 pos 1640 pos 1641 neu 1642 neu 1643 neg 1646 neu 1647 neu 1648 neu 1649 neg 1650 neu 1658 neu 1659 neg 1666 neu 1667 neu 1668 neu 1669 neu 1670 neu 1671 neg 1672 neu 1673 neg 1674 neg 1676 pos Name: sentiment, dtype: object 0.130435 0.260870 0.608696
155 2023-06-04 1677 pos 1679 neg 1680 neu 1682 neu 1683 neg 1685 neu 1692 neu 1694 neg 1696 neg 1698 pos 1699 pos 1700 pos 1704 pos 1705 neu 1706 neg 1708 neu 1710 pos 1711 neu 1713 neu 1716 neu 1718 neu 1719 neg 1721 neu 1722 neu 1724 neu Name: sentiment, dtype: object 0.240000 0.240000 0.520000
156 2023-06-11 1725 pos 1726 neu 1727 neu 1728 neu 1731 neg 1732 neu 1733 neg 1734 neu 1735 neu 1736 neg 1737 neg Name: sentiment, dtype: object 0.090909 0.363636 0.545455

157 rows × 5 columns

news_weight.drop('Data', inplace=True, axis=1)
news_weight
Date news_pos news_neg news_neu
0 2020-06-14 1.000000 0.000000 0.000000
1 2020-06-21 0.333333 0.000000 0.666667
2 2020-06-28 0.000000 1.000000 0.000000
3 2020-07-05 0.000000 0.166667 0.833333
4 2020-07-12 0.000000 0.000000 1.000000
... ... ... ... ...
152 2023-05-14 0.187500 0.125000 0.687500
153 2023-05-21 0.093750 0.218750 0.687500
154 2023-05-28 0.130435 0.260870 0.608696
155 2023-06-04 0.240000 0.240000 0.520000
156 2023-06-11 0.090909 0.363636 0.545455

157 rows × 4 columns

news_weight.plot(x='Date', y=['news_pos','news_neg','news_neu'], title='News Sentiment')

The news headlines were also mostly neutral in tone. Again, the positive and negative shares are negatively correlated.

df_combined.drop('ticker', inplace=True, axis=1)
df_combined.head()
open high low close adjclose volume TwoWeeks Month Home Depot
2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800.0 247.481999 248.1370 90.0
2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800.0 247.300999 247.6660 90.0
2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000.0 247.187000 247.2865 90.0
2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700.0 247.370000 246.9620 90.0
2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300.0 247.842999 247.4940 90.0
df_combined = df_combined.reset_index()
df_combined.rename(columns={'index':'Date'}, inplace=True)
df_combined.head()
Date open high low close adjclose volume TwoWeeks Month Home Depot
0 2020-07-06 250.270004 251.500000 247.039993 249.550003 232.537186 3133800.0 247.481999 248.1370 90.0
1 2020-07-07 247.369995 250.779999 247.070007 247.350006 230.487167 2927800.0 247.300999 247.6660 90.0
2 2020-07-08 247.869995 249.789993 246.220001 249.169998 232.183075 2294000.0 247.187000 247.2865 90.0
3 2020-07-09 249.660004 250.509995 246.350006 247.960007 231.055588 2994700.0 247.370000 246.9620 90.0
4 2020-07-10 248.289993 250.330002 246.639999 250.110001 233.058990 2745300.0 247.842999 247.4940 90.0
import numpy as np
#Average the daily rows into weekly bins so they align with the weekly sentiment frames
df_weekly = df_combined.groupby(pd.Grouper(key='Date', freq='W')).mean(numeric_only=True)
df_weekly.head()
open high low close adjclose volume TwoWeeks Month Home Depot
Date
2020-07-12 248.691998 250.581998 246.664001 248.828003 231.864401 2819120.0 247.436599 247.509100 90.0
2020-07-19 255.570004 259.695996 253.284000 256.733997 239.231400 3799580.0 250.709800 249.001500 86.0
2020-07-26 263.217999 265.070001 261.105994 263.376007 245.420612 2661120.0 256.946602 252.191601 85.0
2020-08-02 265.357996 267.876001 263.217999 265.832001 247.709158 2700400.0 263.277204 256.993502 85.0
2020-08-09 267.792004 270.044006 266.348004 268.508002 250.202713 2319420.0 266.128600 261.537601 80.0
df_final = df_weekly.merge(news_weight, on='Date').merge(tweets_weight, on='Date')
df_final.columns = [i.lower() for i in df_final.columns]
df_final.head()
date open high low close adjclose volume twoweeks month home depot news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 248.691998 250.581998 246.664001 248.828003 231.864401 2819120.0 247.436599 247.509100 90.0 0.000000 0.00 1.000000 0.000000 0.166667 0.833333
1 2020-07-19 255.570004 259.695996 253.284000 256.733997 239.231400 3799580.0 250.709800 249.001500 86.0 0.000000 0.75 0.250000 0.000000 0.400000 0.600000
2 2020-07-26 263.217999 265.070001 261.105994 263.376007 245.420612 2661120.0 256.946602 252.191601 85.0 0.333333 0.00 0.666667 0.125000 0.125000 0.750000
3 2020-08-02 265.357996 267.876001 263.217999 265.832001 247.709158 2700400.0 263.277204 256.993502 85.0 0.200000 0.20 0.600000 0.181818 0.272727 0.545455
4 2020-08-09 267.792004 270.044006 266.348004 268.508002 250.202713 2319420.0 266.128600 261.537601 80.0 0.333333 0.00 0.666667 0.333333 0.222222 0.444444
df_final.rename(columns={"home depot":"trends"}, inplace=True)
df_final.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 248.691998 250.581998 246.664001 248.828003 231.864401 2819120.0 247.436599 247.509100 90.0 0.000000 0.00 1.000000 0.000000 0.166667 0.833333
1 2020-07-19 255.570004 259.695996 253.284000 256.733997 239.231400 3799580.0 250.709800 249.001500 86.0 0.000000 0.75 0.250000 0.000000 0.400000 0.600000
2 2020-07-26 263.217999 265.070001 261.105994 263.376007 245.420612 2661120.0 256.946602 252.191601 85.0 0.333333 0.00 0.666667 0.125000 0.125000 0.750000
3 2020-08-02 265.357996 267.876001 263.217999 265.832001 247.709158 2700400.0 263.277204 256.993502 85.0 0.200000 0.20 0.600000 0.181818 0.272727 0.545455
4 2020-08-09 267.792004 270.044006 266.348004 268.508002 250.202713 2319420.0 266.128600 261.537601 80.0 0.333333 0.00 0.666667 0.333333 0.222222 0.444444
#Scaling the data
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()
df_final_scaled = pd.DataFrame(SS.fit_transform(df_final.iloc[:,1:]), columns=df_final.columns[1:])
df_final_scaled.insert(0, "date", df_final['date'])
#Save the scaled frame; the modeling sections below read it back from this file
df_final_scaled.to_csv('df_final_home_depot_scaled.csv')
df_final_scaled.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 -1.738792 -1.736071 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 2020-07-19 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 -1.644438 -1.692888 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 2020-07-26 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 -1.464655 -1.600581 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 2020-08-02 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 -1.282168 -1.461636 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 2020-08-09 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 -1.199974 -1.330150 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
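
One caveat: fitting the scaler on all three years lets information from the later test weeks leak into the training features. A leakage-free sketch, assuming the same 70/30 chronological split used in the modeling sections below:

#Sketch: fit the scaler on the earliest 70% of weeks only
split = int(len(df_final) * 0.7)
SS_train = StandardScaler().fit(df_final.iloc[:split, 1:])
train_scaled = SS_train.transform(df_final.iloc[:split, 1:])
test_scaled = SS_train.transform(df_final.iloc[split:, 1:])
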
df_final_scaled.corr(numeric_only=True).round(3)
open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
open 1.000 0.999 0.999 0.998 0.986 -0.051 0.983 0.954 -0.221 0.241 -0.043 -0.148 0.009 -0.075 0.058
high 0.999 1.000 0.998 0.999 0.988 -0.040 0.982 0.954 -0.227 0.246 -0.038 -0.156 0.010 -0.074 0.056
low 0.999 0.998 1.000 0.999 0.985 -0.075 0.978 0.947 -0.210 0.238 -0.045 -0.145 0.008 -0.079 0.062
close 0.998 0.999 0.999 1.000 0.988 -0.059 0.978 0.949 -0.218 0.245 -0.041 -0.152 0.009 -0.077 0.059
adjclose 0.986 0.988 0.985 0.988 1.000 -0.042 0.970 0.945 -0.321 0.267 -0.013 -0.192 0.011 -0.072 0.053
volume -0.051 -0.040 -0.075 -0.059 -0.042 1.000 0.018 0.058 -0.057 0.048 -0.005 -0.032 0.014 0.090 -0.092
twoweeks 0.983 0.982 0.978 0.978 0.970 0.018 1.000 0.986 -0.266 0.248 -0.035 -0.160 0.006 -0.077 0.062
month 0.954 0.954 0.947 0.949 0.945 0.058 0.986 1.000 -0.311 0.229 -0.034 -0.146 -0.003 -0.074 0.068
trends -0.221 -0.227 -0.210 -0.218 -0.321 -0.057 -0.266 -0.311 1.000 -0.227 -0.072 0.227 -0.015 0.006 0.008
news_pos 0.241 0.246 0.238 0.245 0.267 0.048 0.248 0.229 -0.227 1.000 -0.151 -0.637 -0.074 0.002 0.066
news_neg -0.043 -0.038 -0.045 -0.041 -0.013 -0.005 -0.035 -0.034 -0.072 -0.151 1.000 -0.666 -0.004 -0.005 0.008
news_neu -0.148 -0.156 -0.145 -0.152 -0.192 -0.032 -0.160 -0.146 0.227 -0.637 -0.666 1.000 0.059 0.003 -0.056
tweet_pos 0.009 0.010 0.008 0.009 0.011 0.014 0.006 -0.003 -0.015 -0.074 -0.004 0.059 1.000 -0.378 -0.581
tweet_neg -0.075 -0.074 -0.079 -0.077 -0.072 0.090 -0.077 -0.074 0.006 0.002 -0.005 0.003 -0.378 1.000 -0.534
tweet_neu 0.058 0.056 0.062 0.059 0.053 -0.092 0.062 0.068 0.008 0.066 0.008 -0.056 -0.581 -0.534 1.000

There is a modest positive correlation between positive news and the two-week price average; we do not see a comparable correlation for positive tweets or any of the other sentiment columns. Additionally, there is a negative correlation between trends and the two-week price average: weeks in which people search Google for "Home Depot" more heavily tend to coincide with lower prices.

cols=['news_pos', 'news_neg','news_neu', 'tweet_pos', 'tweet_neg','tweet_neu', 'trends']
fig, ax = plt.subplots(3, 3, figsize=(15, 12))
ax = ax.ravel()
for i in range(len(cols)):
    df_final_scaled.plot(x='date', y=['close','twoweeks','month', cols[i]], ax=ax[i], title=cols[i])
#Hide the two unused subplots
ax[7].axis('off')
ax[8].axis('off')
plt.tight_layout()

We can observe the correlations mentioned above in the plots as well. The first plot, representing positive news, indicates that positive news loosely follows the stock price. On the other hand, the last plot, depicting trends, shows a negative correlation between Google searches and the stock price. Furthermore, over time, the number of Google searches for "Home Depot" has steadily declined.

Applying Linear Regression to Historical Price Data:

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
plt.style.use("ggplot")
df_final_scaled = pd.read_csv('df_final_home_depot_scaled.csv')  #saved at the end of the scaling step above
df_final_scaled.drop('Unnamed: 0', inplace=True, axis=1)  #drop the index column written by to_csv
df_final_scaled.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 -1.738792 -1.736071 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 2020-07-19 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 -1.644438 -1.692888 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 2020-07-26 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 -1.464655 -1.600581 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 2020-08-02 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 -1.282168 -1.461636 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 2020-08-09 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 -1.199974 -1.330150 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
x= df_final_scaled[['open','high','low','close', 'adjclose','volume','news_pos', 'news_neg', 'news_neu', 'tweet_pos',
       'tweet_neg', 'tweet_neu', 'trends']]
y=df_final_scaled['twoweeks']
import statsmodels.api as sm
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               twoweeks   R-squared:                       0.977
Model:                            OLS   Adj. R-squared:                  0.975
Method:                 Least Squares   F-statistic:                     537.8
Date:                Mon, 12 Jun 2023   Prob (F-statistic):          3.14e-109
Time:                        02:36:26   Log-Likelihood:                 70.561
No. Observations:                 153   AIC:                            -117.1
Df Residuals:                     141   BIC:                            -80.76
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.501e-16      0.013  -1.17e-14      1.000      -0.025       0.025
open           2.3468      0.511      4.594      0.000       1.337       3.357
high          -0.2428      0.541     -0.448      0.655      -1.313       0.828
low           -0.7347      0.559     -1.314      0.191      -1.840       0.371
close         -0.2749      0.545     -0.504      0.615      -1.352       0.802
adjclose      -0.1252      0.118     -1.064      0.289      -0.358       0.107
volume         0.0494      0.017      2.981      0.003       0.017       0.082
news_pos       0.0014      0.010      0.141      0.888      -0.019       0.022
news_neg       0.0043      0.010      0.438      0.662      -0.015       0.023
news_neu      -0.0044      0.007     -0.594      0.554      -0.019       0.010
tweet_pos     -0.0020      0.009     -0.229      0.819      -0.019       0.015
tweet_neg     -0.0071      0.009     -0.778      0.438      -0.025       0.011
tweet_neu      0.0081      0.008      1.009      0.315      -0.008       0.024
trends        -0.0524      0.019     -2.785      0.006      -0.090      -0.015
==============================================================================
Omnibus:                        1.461   Durbin-Watson:                   1.694
Prob(Omnibus):                  0.482   Jarque-Bera (JB):                1.534
Skew:                           0.193   Prob(JB):                        0.464
Kurtosis:                       2.697   Cond. No.                     5.93e+15
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.26e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
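
The huge condition number flagged in note [2] reflects the near-perfect correlation among the price columns seen in the correlation matrix (open, high, low, close and adjclose all correlate above 0.98). Variance inflation factors make this explicit; a diagnostic sketch:

#Sketch: variance inflation factors for the price/volume features
from statsmodels.stats.outliers_influence import variance_inflation_factor
price_cols = ['open', 'high', 'low', 'close', 'adjclose', 'volume']
exog = df_final_scaled[price_cols].values
for i, col in enumerate(price_cols):
    print(col, variance_inflation_factor(exog, i))
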
x = df_final_scaled[['open','high','low','close', 'adjclose','volume']]
y = df_final_scaled['twoweeks']
from sklearn.model_selection import train_test_split 
#shuffle=False keeps the split chronological: train on the earlier weeks, test on the later ones
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, shuffle=False, random_state=0)
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(train_x, train_y)
print("regression coefficient",regression.coef_)
print("regression intercept",regression.intercept_)
regression coefficient [ 2.15513818  0.1568129  -0.63183193 -0.94351993  0.2494587   0.05989017]
regression intercept 0.017827179785784394
regression_confidence = regression.score(test_x, test_y)  #score() returns R^2 on the test set
print("linear regression confidence: ", regression_confidence)
linear regression confidence:  0.8354280496114086
predicted=regression.predict(test_x)
print(test_x.head())
         open      high       low     close  adjclose    volume
107 -0.243796 -0.255966 -0.239776 -0.208430 -0.104242 -0.699127
108 -0.115385 -0.079953 -0.082182 -0.058794  0.042456 -0.785429
109  0.099918  0.070874  0.110736  0.082941  0.181410 -0.899154
110  0.366759  0.437578  0.380624  0.428573  0.520257  0.542134
111  0.080282  0.039007  0.007433 -0.028003  0.072644 -0.737113
dfr=pd.DataFrame({'Actual_Price':test_y, 'Predicted_Price':predicted})
dfr.head(10)
Actual_Price Predicted_Price
107 -0.251375 -0.267445
108 -0.142925 -0.172431
109 -0.051125 0.087459
110 0.166720 0.394254
111 0.250333 0.192664
112 -0.043019 -0.309534
113 -0.333535 -0.397486
114 -0.476311 -0.423891
115 -0.740981 -0.954391
116 -0.992016 -0.840699
dfr.describe()
Actual_Price Predicted_Price
count 46.000000 46.000000
mean -0.191756 -0.132461
std 0.438800 0.433547
min -0.992016 -0.954391
25% -0.502699 -0.417290
50% -0.331794 -0.271980
75% 0.274308 0.247046
max 0.478988 0.858482
from sklearn import metrics
import numpy as np
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(test_y, predicted))
print('Mean Squared Error (MSE) :', metrics.mean_squared_error(test_y, predicted))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(test_y, predicted)))
Mean Absolute Error (MAE): 0.14249150773348154
Mean Squared Error (MSE) : 0.030998755667901733
Root Mean Squared Error (RMSE): 0.17606463491542454
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price,  color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()
plt.plot(dfr.Actual_Price, color='black', label='Actual')
plt.plot(dfr.Predicted_Price, color='blue',label='Predicted')
plt.title("Prediction using only Historical Data")
plt.legend()

Applying Linear Regression to Both Historical and Newly Added Features:

df_final_scaled.columns
Index(['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume',
       'twoweeks', 'month', 'trends', 'news_pos', 'news_neg', 'news_neu',
       'tweet_pos', 'tweet_neg', 'tweet_neu'],
      dtype='object')
x= df_final_scaled[['open','high','low','close', 'adjclose','volume','news_pos', 'news_neg', 'news_neu', 'tweet_pos',
       'tweet_neg', 'tweet_neu', 'trends']]

y=df_final_scaled['twoweeks']
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3 , shuffle=False,random_state = 0)
regression = LinearRegression()
regression.fit(train_x, train_y)
print("regression coefficient",regression.coef_)
print("regression intercept",regression.intercept_)
regression coefficient [ 1.91871810e+00  1.89637622e-01 -3.49725608e-01 -8.35917172e-01
  4.63540880e-02  6.48403899e-02  1.45467759e-03  5.40953328e-03
 -5.31782650e-03 -3.13379351e-03 -7.57685954e-03  9.52332816e-03
 -5.55338621e-02]
regression intercept 0.023765631258188702
regression_confidence = regression.score(test_x, test_y)
print("linear regression confidence: ", regression_confidence)
linear regression confidence:  0.8294859517339385
predicted=regression.predict(test_x)
print(test_x.head())
         open      high       low     close  adjclose    volume  news_pos  \
107 -0.243796 -0.255966 -0.239776 -0.208430 -0.104242 -0.699127  0.332721   
108 -0.115385 -0.079953 -0.082182 -0.058794  0.042456 -0.785429 -1.106820   
109  0.099918  0.070874  0.110736  0.082941  0.181410 -0.899154  0.332721   
110  0.366759  0.437578  0.380624  0.428573  0.520257  0.542134  0.512664   
111  0.080282  0.039007  0.007433 -0.028003  0.072644 -0.737113  0.512664   

     news_neg  news_neu  tweet_pos  tweet_neg  tweet_neu    trends  
107  0.894529 -0.948910   0.533344   1.071344  -1.428981 -0.475612  
108  0.894529  0.137234   0.824251  -1.044107   0.165137 -0.380115  
109  0.894529 -0.948910  -1.503002   0.567665   0.873634 -0.666606  
110 -0.802239  0.239060   0.471637  -0.018434  -0.414542 -0.666606  
111  0.372447 -0.677374   1.890908  -0.338957  -1.428981 -0.475612  
from sklearn.metrics import r2_score
r2_score(predicted, test_y)  #NB: arguments are swapped here; see the note below
0.8290817124619406
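
Note that r2_score expects the true values first: r2_score(y_true, y_pred). With the arguments swapped, as above, the result is a different quantity from regression.score(test_x, test_y). The conventional call:

#r2_score(y_true, y_pred): true values first
r2_score(test_y, predicted)   #matches regression.score(test_x, test_y)
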
dfr=pd.DataFrame({'Actual_Price':test_y, 'Predicted_Price':predicted})
dfr.head(10)
Actual_Price Predicted_Price
107 -0.251375 -0.271245
108 -0.142925 -0.153351
109 -0.051125 0.127086
110 0.166720 0.405224
111 0.250333 0.177392
112 -0.043019 -0.300313
113 -0.333535 -0.381184
114 -0.476311 -0.385414
115 -0.740981 -0.939092
116 -0.992016 -0.805985
dfr.describe()
Actual_Price Predicted_Price
count 46.000000 46.000000
mean -0.191756 -0.112709
std 0.438800 0.438281
min -0.992016 -0.939092
25% -0.502699 -0.419594
50% -0.331794 -0.269632
75% 0.274308 0.268095
max 0.478988 0.895990
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(test_y, predicted))
print('Mean Squared Error (MSE) :', metrics.mean_squared_error(test_y, predicted))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(test_y, predicted)))
Mean Absolute Error (MAE): 0.14758907019543632
Mean Squared Error (MSE) : 0.0321180086136408
Root Mean Squared Error (RMSE): 0.17921497876472492
plt.scatter(dfr.Actual_Price, dfr.Predicted_Price,  color='Darkblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()
plt.plot(dfr.Actual_Price, color='black', label='Actual')
plt.plot(dfr.Predicted_Price, color='blue',label='Predicted')
plt.title("Predictions using  Historical, Sentiment and Trends")
plt.legend()

Applying CNN+LSTM to Historical Data:

import pandas as pd
df = pd.read_csv('df_final_home_depot_scaled.csv')
df.drop('Unnamed: 0', inplace=True, axis=1)
df.head()
date open high low close adjclose volume twoweeks month trends news_pos news_neg news_neu tweet_pos tweet_neg tweet_neu
0 2020-07-12 -1.703818 -1.730261 -1.682950 -1.705764 -1.872820 -0.840392 -1.738792 -1.736071 2.580289 -1.106820 -1.193801 1.766450 -1.503002 -0.103907 1.464049
1 2020-07-19 -1.505790 -1.470011 -1.490960 -1.477822 -1.658905 -0.124160 -1.644438 -1.692888 2.198301 -1.106820 3.504942 -1.899286 -1.503002 1.212374 0.306837
2 2020-07-26 -1.285591 -1.316556 -1.264110 -1.286323 -1.479190 -0.955812 -1.464655 -1.600581 2.102805 1.052492 -1.193801 0.137234 -0.824220 -0.338957 1.050759
3 2020-08-02 -1.223978 -1.236431 -1.202859 -1.215514 -1.412737 -0.927118 -1.282168 -1.461636 2.102805 0.188767 0.059197 -0.188609 -0.515683 0.494403 0.036320
4 2020-08-09 -1.153899 -1.174523 -1.112084 -1.138360 -1.340332 -1.205426 -1.199974 -1.330150 1.625320 1.052492 -1.193801 0.137234 0.307084 0.209494 -0.464638
df.columns
Index(['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume',
       'twoweeks', 'month', 'trends', 'news_pos', 'news_neg', 'news_neu',
       'tweet_pos', 'tweet_neg', 'tweet_neu'],
      dtype='object')
df_hist = df[['date', 'open', 'high', 'low', 'close', 'adjclose', 'volume',
       'twoweeks', 'month']]
df_hist.shape
(153, 9)
#Use the six scaled price/volume columns (open, high, low, close, adjclose, volume) as inputs
X = df_hist[['open', 'high', 'low', 'close', 'adjclose', 'volume']].values.tolist()
#Use the "twoweeks" column as the target
Y = df_hist['twoweeks'].tolist()
import numpy as np
from sklearn.model_selection import train_test_split 


#NB: shuffle=True mixes weeks from all periods across train and test,
#unlike the chronological split used for the linear models above
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)

train_X = np.array(x_train)
test_X = np.array(x_test)
train_Y = np.array(y_train)
test_Y = np.array(y_test)
#Reshape each sample to (time steps=1, features=6, channels=1) for the TimeDistributed Conv1D layers
train_X = train_X.reshape(train_X.shape[0],1,6,1)
test_X = test_X.reshape(test_X.shape[0],1,6,1)
print(len(train_X))
print(len(test_X))
107
46
train_X[0]
array([[[-1.09683374],
        [-1.10216509],
        [-1.12420655],
        [-1.11575665],
        [-1.28069411],
        [-0.48313096]]])
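
Each sample above is a single week reshaped to (time steps=1, features=6, channels=1), so the LSTM layers never see an actual sequence of weeks. A more conventional CNN+LSTM input stacks a lookback window of past weeks per sample; a sketch, assuming a hypothetical four-week lookback (not used below):

#Sketch: sliding four-week windows, so each input is a true sequence
LOOKBACK = 4
X_arr, Y_arr = np.array(X), np.array(Y)
X_seq = np.array([X_arr[i:i + LOOKBACK] for i in range(len(X_arr) - LOOKBACK)])
Y_seq = Y_arr[LOOKBACK:]   #target is the week following each window
X_seq = X_seq.reshape(X_seq.shape[0], LOOKBACK, 6, 1)   #(samples, weeks, features, channels)
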
from tensorflow.keras import backend as K
K.image_data_format()=="channels_first"
False
# For creating model and training
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
from tensorflow.keras.layers import MaxPooling1D, Flatten
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.metrics import RootMeanSquaredError

model = tf.keras.Sequential()

# Creating the Neural Network model here...
# CNN layers: each weekly sample has shape (1, 6, 1) -> one time step holding six features
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu', input_shape=(None, 6, 1))))
model.add(TimeDistributed(MaxPooling1D(1)))  #pool size 1 leaves the sequence length unchanged
model.add(TimeDistributed(Conv1D(128, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Conv1D(64, kernel_size=2, activation='relu')))
model.add(TimeDistributed(MaxPooling1D(1)))
model.add(TimeDistributed(Flatten()))
# model.add(Dense(5, kernel_regularizer=L2(0.01)))

# LSTM layers
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dropout(0.5))

#Final layers
model.add(Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse', metrics=['mse', 'mae'])

history = model.fit(train_X, train_Y, validation_data=(test_X,test_Y), epochs=40,batch_size=40, verbose=1, shuffle =True)
Epoch 1/40
3/3 [==============================] - 13s 910ms/step - loss: 0.8740 - mse: 0.8740 - mae: 0.7390 - val_loss: 1.1876 - val_mse: 1.1876 - val_mae: 0.8232
Epoch 2/40
3/3 [==============================] - 0s 34ms/step - loss: 0.8003 - mse: 0.8003 - mae: 0.7047 - val_loss: 1.0375 - val_mse: 1.0375 - val_mae: 0.7627
Epoch 3/40
3/3 [==============================] - 0s 38ms/step - loss: 0.6692 - mse: 0.6692 - mae: 0.6315 - val_loss: 0.7258 - val_mse: 0.7258 - val_mae: 0.6202
Epoch 4/40
3/3 [==============================] - 0s 42ms/step - loss: 0.3853 - mse: 0.3853 - mae: 0.4627 - val_loss: 0.2971 - val_mse: 0.2971 - val_mae: 0.3433
Epoch 5/40
3/3 [==============================] - 0s 41ms/step - loss: 0.1832 - mse: 0.1832 - mae: 0.2912 - val_loss: 0.1127 - val_mse: 0.1127 - val_mae: 0.2656
Epoch 6/40
3/3 [==============================] - 0s 46ms/step - loss: 0.1054 - mse: 0.1054 - mae: 0.2555 - val_loss: 0.2310 - val_mse: 0.2310 - val_mae: 0.3723
Epoch 7/40
3/3 [==============================] - 0s 67ms/step - loss: 0.1590 - mse: 0.1590 - mae: 0.3158 - val_loss: 0.1858 - val_mse: 0.1858 - val_mae: 0.3358
Epoch 8/40
3/3 [==============================] - 0s 60ms/step - loss: 0.1221 - mse: 0.1221 - mae: 0.2605 - val_loss: 0.0740 - val_mse: 0.0740 - val_mae: 0.2339
Epoch 9/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0769 - mse: 0.0769 - mae: 0.2229 - val_loss: 0.0902 - val_mse: 0.0902 - val_mae: 0.2187
Epoch 10/40
3/3 [==============================] - 0s 81ms/step - loss: 0.0740 - mse: 0.0740 - mae: 0.1926 - val_loss: 0.1062 - val_mse: 0.1062 - val_mae: 0.2100
Epoch 11/40
3/3 [==============================] - 0s 64ms/step - loss: 0.0807 - mse: 0.0807 - mae: 0.1986 - val_loss: 0.0893 - val_mse: 0.0893 - val_mae: 0.2025
Epoch 12/40
3/3 [==============================] - 0s 61ms/step - loss: 0.0891 - mse: 0.0891 - mae: 0.2086 - val_loss: 0.0575 - val_mse: 0.0575 - val_mae: 0.1793
Epoch 13/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0599 - mse: 0.0599 - mae: 0.1769 - val_loss: 0.0513 - val_mse: 0.0513 - val_mae: 0.1723
Epoch 14/40
3/3 [==============================] - 0s 65ms/step - loss: 0.0515 - mse: 0.0515 - mae: 0.1749 - val_loss: 0.0740 - val_mse: 0.0740 - val_mae: 0.2093
Epoch 15/40
3/3 [==============================] - 0s 57ms/step - loss: 0.0550 - mse: 0.0550 - mae: 0.1783 - val_loss: 0.0788 - val_mse: 0.0788 - val_mae: 0.2178
Epoch 16/40
3/3 [==============================] - 0s 56ms/step - loss: 0.0655 - mse: 0.0655 - mae: 0.1941 - val_loss: 0.0548 - val_mse: 0.0548 - val_mae: 0.1837
Epoch 17/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0431 - mse: 0.0431 - mae: 0.1603 - val_loss: 0.0412 - val_mse: 0.0412 - val_mae: 0.1610
Epoch 18/40
3/3 [==============================] - 0s 57ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1512 - val_loss: 0.0473 - val_mse: 0.0473 - val_mae: 0.1615
Epoch 19/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0426 - mse: 0.0426 - mae: 0.1599 - val_loss: 0.0502 - val_mse: 0.0502 - val_mae: 0.1643
Epoch 20/40
3/3 [==============================] - 0s 61ms/step - loss: 0.0579 - mse: 0.0579 - mae: 0.1759 - val_loss: 0.0423 - val_mse: 0.0423 - val_mae: 0.1570
Epoch 21/40
3/3 [==============================] - 0s 60ms/step - loss: 0.0507 - mse: 0.0507 - mae: 0.1801 - val_loss: 0.0473 - val_mse: 0.0473 - val_mae: 0.1655
Epoch 22/40
3/3 [==============================] - 0s 71ms/step - loss: 0.0399 - mse: 0.0399 - mae: 0.1639 - val_loss: 0.0588 - val_mse: 0.0588 - val_mae: 0.1883
Epoch 23/40
3/3 [==============================] - 0s 36ms/step - loss: 0.0468 - mse: 0.0468 - mae: 0.1620 - val_loss: 0.0428 - val_mse: 0.0428 - val_mae: 0.1614
Epoch 24/40
3/3 [==============================] - 0s 35ms/step - loss: 0.0417 - mse: 0.0417 - mae: 0.1593 - val_loss: 0.0419 - val_mse: 0.0419 - val_mae: 0.1566
Epoch 25/40
3/3 [==============================] - 0s 40ms/step - loss: 0.0481 - mse: 0.0481 - mae: 0.1619 - val_loss: 0.0418 - val_mse: 0.0418 - val_mae: 0.1554
Epoch 26/40
3/3 [==============================] - 0s 40ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1476 - val_loss: 0.0420 - val_mse: 0.0420 - val_mae: 0.1560
Epoch 27/40
3/3 [==============================] - 0s 38ms/step - loss: 0.0471 - mse: 0.0471 - mae: 0.1628 - val_loss: 0.0429 - val_mse: 0.0429 - val_mae: 0.1625
Epoch 28/40
3/3 [==============================] - 0s 35ms/step - loss: 0.0461 - mse: 0.0461 - mae: 0.1648 - val_loss: 0.0434 - val_mse: 0.0434 - val_mae: 0.1660
Epoch 29/40
3/3 [==============================] - 0s 36ms/step - loss: 0.0360 - mse: 0.0360 - mae: 0.1533 - val_loss: 0.0427 - val_mse: 0.0427 - val_mae: 0.1616
Epoch 30/40
3/3 [==============================] - 0s 36ms/step - loss: 0.0376 - mse: 0.0376 - mae: 0.1511 - val_loss: 0.0431 - val_mse: 0.0431 - val_mae: 0.1578
Epoch 31/40
3/3 [==============================] - 0s 37ms/step - loss: 0.0435 - mse: 0.0435 - mae: 0.1703 - val_loss: 0.0448 - val_mse: 0.0448 - val_mae: 0.1599
Epoch 32/40
3/3 [==============================] - 0s 39ms/step - loss: 0.0414 - mse: 0.0414 - mae: 0.1560 - val_loss: 0.0459 - val_mse: 0.0459 - val_mae: 0.1602
Epoch 33/40
3/3 [==============================] - 0s 44ms/step - loss: 0.0437 - mse: 0.0437 - mae: 0.1640 - val_loss: 0.0477 - val_mse: 0.0477 - val_mae: 0.1672
Epoch 34/40
3/3 [==============================] - 0s 43ms/step - loss: 0.0421 - mse: 0.0421 - mae: 0.1515 - val_loss: 0.0508 - val_mse: 0.0508 - val_mae: 0.1743
Epoch 35/40
3/3 [==============================] - 0s 45ms/step - loss: 0.0427 - mse: 0.0427 - mae: 0.1623 - val_loss: 0.0461 - val_mse: 0.0461 - val_mae: 0.1658
Epoch 36/40
3/3 [==============================] - 0s 34ms/step - loss: 0.0444 - mse: 0.0444 - mae: 0.1668 - val_loss: 0.0426 - val_mse: 0.0426 - val_mae: 0.1588
Epoch 37/40
3/3 [==============================] - 0s 38ms/step - loss: 0.0468 - mse: 0.0468 - mae: 0.1661 - val_loss: 0.0496 - val_mse: 0.0496 - val_mae: 0.1661
Epoch 38/40
3/3 [==============================] - 0s 38ms/step - loss: 0.0431 - mse: 0.0431 - mae: 0.1544 - val_loss: 0.0435 - val_mse: 0.0435 - val_mae: 0.1588
Epoch 39/40
3/3 [==============================] - 0s 36ms/step - loss: 0.0392 - mse: 0.0392 - mae: 0.1587 - val_loss: 0.0440 - val_mse: 0.0440 - val_mae: 0.1577
Epoch 40/40
3/3 [==============================] - 0s 34ms/step - loss: 0.0546 - mse: 0.0546 - mae: 0.1726 - val_loss: 0.0544 - val_mse: 0.0544 - val_mae: 0.1774
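
Validation loss flattens out after roughly epoch 17, so the later epochs mostly add noise. An EarlyStopping callback would halt training once val_loss stops improving; a sketch (re-running fit like this continues from the current weights, so the model would need to be rebuilt for a fresh run):

#Sketch: stop when val_loss stops improving and keep the best weights
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                              restore_best_weights=True)
history = model.fit(train_X, train_Y, validation_data=(test_X, test_Y),
                    epochs=40, batch_size=40, verbose=1, shuffle=True,
                    callbacks=[early_stop])
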
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.legend()
plt.plot(history.history['mse'], label='train mse')
plt.plot(history.history['val_mse'], label='val mse')
plt.xlabel("epoch")
plt.ylabel("MSE")
plt.legend()