-
Notifications
You must be signed in to change notification settings - Fork 0
/
genFeatureMat_GloVe.py
executable file
·99 lines (84 loc) · 3.5 KB
/
genFeatureMat_GloVe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/python
import json
import os
import en
import nltk
import numpy as np
import util
# Use pretrained GloVe word vectors to generate the feature matrices.
# Required input data:
#     ./input/glove.xB.xd.txt
#     ./input/stopWords
#     ./input/stockReturns.json
#     ./input/news_reuters.csv
#     ./input/featureMatrix  (NOTE(review): never opened in this file -- confirm it is still required)
# Output file names:
#     input/featureMatrix_train
#     input/featureMatrix_test
def wordVec(glove_file):
    """Load pretrained word vectors from a GloVe-style text file.

    Every non-blank line is split on single spaces; the first field is the
    token and the remaining fields are parsed as the float components of its
    vector.

    Args:
        glove_file: Path of the word-vector text file.

    Returns:
        A ``(wordDict, dim)`` tuple. ``wordDict`` maps each token to a list
        of floats and ``dim`` is the length of the last vector read
        (0 when the file contains no vectors).

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    wordDict = {}
    dim = 0
    with open(glove_file) as f:
        print("Loading word vector ...")
        for line in f:
            fields = line.strip().split(' ')
            # Skip blank lines (e.g. a trailing newline) so they neither add
            # an empty-string key nor reset the reported dimension to 0.
            if not fields[0]:
                continue
            # Build a real list: on Python 3 ``map`` returns a one-shot lazy
            # iterator that has no ``len()`` and is exhausted after one pass.
            values = [float(component) for component in fields[1:]]
            wordDict[fields[0]] = values
            dim = len(values)
    return wordDict, dim  # word vectors and word-vector dimension
def gen_FeatureMatrix(news_file, price_file, stopWords_file, output, wordDict, dim_wordVec, sentense_len, term_type, mtype):
    """Build feature/label rows for one data split and append them to a file.

    For every usable news line the headline is tokenized, each token found in
    ``wordDict`` contributes its vector as one row, the stacked rows are
    passed through ``util.padding`` and the label is stacked to the right of
    the result before it is written with ``np.savetxt`` (format ``%.5f``).

    Args:
        news_file: Text file read line by line. A line is used only if
            splitting it on ',' yields exactly six fields: ticker, name, day,
            headline, body, newsType.
        price_file: JSON file; ``json[term_type][ticker][day]`` supplies the
            numeric label of a sample.
        stopWords_file: File with one stop word per line. It is loaded, but
            the stop-word filter itself is currently disabled.
        output: Path prefix of the output file; the file written is
            ``output + mtype``.
        wordDict: Mapping from token to its word vector.
        dim_wordVec: Length of each word vector.
        sentense_len: Forwarded to ``util.padding`` as the target length.
        term_type: Key selecting the label table inside ``price_file``.
        mtype: ``"train"`` or ``"test"``. Test samples are those whose day is
            in ``util.dateGenerator(300)``; train samples are all the others.

    Returns:
        None. Results are written to ``output + mtype``.
    """
    with open(price_file) as file:
        print("Loading price info ...")
        priceDt = json.load(file)[term_type]

    testDates = util.dateGenerator(300)

    # Samples are appended below, so a stale file from a previous run must be
    # removed first. os.remove avoids spawning a shell: it works on every
    # platform, is safe for paths containing spaces or shell metacharacters,
    # and stays silent when there is nothing to delete.
    out_path = output + mtype
    if os.path.exists(out_path):
        os.remove(out_path)

    # Only needed by the (currently disabled) stop-word filter further down.
    stopWords = set()
    with open(stopWords_file) as file:
        for word in file:
            stopWords.add(word.strip())

    cnt = 0
    with open(news_file) as f:
        for line in f:
            fields = line.strip().split(',')
            if len(fields) != 6:
                continue
            # newsType: [topStory, normal]
            ticker, name, day, headline, body, newsType = fields
            if newsType != 'topStory':
                continue  # skip normal news
            if ticker not in priceDt:
                continue  # skip if no corresponding company found
            if day not in priceDt[ticker]:
                continue  # skip if no corresponding date found

            # Progress counts every labelled top story, whichever split it
            # ends up in, so both passes report the same numbers.
            cnt += 1
            if cnt % 1000 == 0:
                print("%sing samples %d" % (mtype, cnt))
            if mtype == "test" and day not in testDates:
                continue
            if mtype == "train" and day in testDates:
                continue

            # Tokenize the headline only (the body is not used) and unify the
            # format of each word.
            tokens = [util.unify_word(tok) for tok in nltk.word_tokenize(headline)]

            # One row per token with a known vector. The stop-word filter
            # would go here: `if t in stopWords: continue`.
            vectors = [np.matrix(wordDict[t]) for t in tokens if t in wordDict]
            if not vectors:
                continue  # feature is empty, nothing to write

            # Stack once instead of re-copying the matrix for every token.
            # The (0, dim_wordVec) seed keeps the original width check: a
            # vector of any other length still raises ValueError.
            feature = np.vstack([np.zeros([0, dim_wordVec])] + vectors)
            feature = util.padding(feature, sentense_len)
            label = round(priceDt[ticker][day], 6)

            # Append per sample so the output file only comes into existence
            # once there is at least one sample to store.
            with open(out_path, 'a+') as file:
                np.savetxt(file, np.hstack((feature, np.matrix(label))), fmt='%.5f')
def main():
    """Generate the train and test feature matrices from the default inputs.

    Loads the GloVe vectors once, then runs ``gen_FeatureMatrix`` for the
    'train' split followed by the 'test' split with identical settings.
    """
    word_vectors, vector_dim = wordVec("./input/glove.6B.100d.txt")

    # Positional arguments shared by both splits, in gen_FeatureMatrix order.
    shared_args = (
        "./input/news_reuters.csv",   # news_file
        "./input/stockReturns.json",  # price_file
        "./input/stopWords",          # stopWords_file
        './input/featureMatrix_',     # output prefix
        word_vectors,                 # wordDict
        vector_dim,                   # dim_wordVec
        20,                           # sentense_len
        'short',                      # term_type
    )
    for split in ('train', 'test'):
        gen_FeatureMatrix(*(shared_args + (split,)))
# Run the pipeline only when this file is executed as a script, so importing
# it (e.g. to reuse wordVec) has no side effects.
if __name__ == "__main__":
    main()