def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): sorted list of distinct words across the corpus
            n_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    n_corpus_words = -1

    # ------------------
    # Write your implementation here.
    # A set comprehension deduplicates across all documents; sorted() returns a list.
    corpus_words = sorted({word for document in corpus for word in document})
    n_corpus_words = len(corpus_words)
    # ------------------

    return corpus_words, n_corpus_words
def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).

        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
              number of co-occurring words.

              For example, if we take the document "<START> All that glitters is not gold <END>" with window size of 4,
              "All" will co-occur with "<START>", "that", "glitters", "is", and "not".

        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (a symmetric numpy matrix of shape (number of unique words in the corpus, number of unique words in the corpus)):
                Co-occurrence matrix of word counts.
                The ordering of the words in the rows/columns should be the same as the ordering of the words
                given by the distinct_words function.
            word2ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, n_words = distinct_words(corpus)
    M = None
    word2ind = {}

    # ------------------
    # Write your implementation here.
    M = np.zeros((n_words, n_words))
    word2ind = {word: ix for ix, word in enumerate(words)}
    for sentence in corpus:
        for i, word in enumerate(sentence):
            # Count context words to the left of the center word...
            for context in sentence[max(0, i - window_size):i]:
                M[word2ind[word], word2ind[context]] += 1
            # ...and to the right (slicing already clips at the sentence end).
            for context in sentence[i + 1:i + window_size + 1]:
                M[word2ind[word], word2ind[context]] += 1
    # ------------------

    return M, word2ind
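As a quick sanity check (not part of the assignment handout), the two functions above can be exercised on the toy document from the docstring; this sketch assumes numpy is available as np:

import numpy as np  # also required by compute_co_occurrence_matrix above

# Hypothetical sanity check on the docstring's toy document.
toy_corpus = [["<START>", "All", "that", "glitters", "is", "not", "gold", "<END>"]]
M_toy, word2ind_toy = compute_co_occurrence_matrix(toy_corpus, window_size=4)

# "All" should co-occur once each with "<START>", "that", "glitters", "is", "not".
row = M_toy[word2ind_toy["All"]]
assert row[word2ind_toy["<START>"]] == 1 and row[word2ind_toy["not"]] == 1
assert row[word2ind_toy["gold"]] == 0   # "gold" is 5 positions away, outside the window
# Counting both directions makes the matrix symmetric.
assert (M_toy == M_toy.T).all()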
Question 1.3: Implement reduce_to_k_dim
Write a method that performs dimensionality reduction on the matrix to produce k-dimensional embeddings. Use SVD to take the top k components and produce a new matrix of k-dimensional embeddings.
def reduce_to_k_dim(M, k=2):
    """ Reduce a co-occurrence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        to a matrix of dimensionality (num_corpus_words, k) using the following SVD function from Scikit-Learn:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

        Params:
            M (numpy matrix of shape (number of unique words in the corpus, number of unique words in the corpus)): co-occurrence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensional word embeddings.
                    In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10  # Use this parameter in your call to `TruncatedSVD`
    M_reduced = None
    print("Running Truncated SVD over %i words..." % (M.shape[0]))

    # ------------------
    # Write your implementation here.
    svd = TruncatedSVD(n_components=k, n_iter=n_iters)
    M_reduced = svd.fit_transform(M)
    # ------------------

    return M_reduced
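A minimal shape check, reusing toy_corpus from the sanity-check sketch above: fit_transform should return one k-dimensional row per vocabulary word.

from sklearn.decomposition import TruncatedSVD  # needed by reduce_to_k_dim

M_toy_reduced = reduce_to_k_dim(M_toy, k=2)
assert M_toy_reduced.shape == (M_toy.shape[0], 2)  # one 2-D embedding per word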
def plot_embeddings(M_reduced, word2ind, words):
    """ Plot in a scatterplot the embeddings of the words specified in the list "words".
        NOTE: do not plot all the words listed in M_reduced / word2ind.
        Include a label next to each point.

        Params:
            M_reduced (numpy matrix of shape (number of unique words in the corpus, 2)): matrix of 2-dimensional word embeddings
            word2ind (dict): dictionary that maps word to indices for matrix M
            words (list of strings): words whose embeddings we want to visualize
    """
    # ------------------
    # Write your implementation here.
    index = [word2ind[word] for word in words]
    X = M_reduced[index]
    plt.scatter(X[:, 0], X[:, 1])
    # Label each point with its word.
    for i, word in enumerate(words):
        plt.text(X[i, 0], X[i, 1], word)
    plt.title("word embeddings")
    plt.show()
    # ------------------
# -----------------------------
# Run This Cell to Produce Your Plot
# -----------------------------
imdb_corpus = read_corpus()
M_co_occurrence, word2ind_co_occurrence = compute_co_occurrence_matrix(imdb_corpus)
M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)
# Rescale (normalize) the rows to make them each of unit-length
M_lengths = np.linalg.norm(M_reduced_co_occurrence, axis=1)
M_normalized = M_reduced_co_occurrence / M_lengths[:, np.newaxis]  # broadcasting
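The cell as shown stops after normalization; presumably it ends by calling plot_embeddings on the normalized co-occurrence embeddings. A sketch of that final step, assuming the same word list that Question 2.1 below uses for the GloVe plot:

# Hypothetical plotting call; the word list mirrors the one used for GloVe in Question 2.1.
words = ['movie', 'book', 'mysterious', 'story', 'fascinating', 'good',
         'interesting', 'large', 'massive', 'huge']
plot_embeddings(M_normalized, word2ind_co_occurrence, words)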
def get_matrix_of_vectors(wv_from_bin, required_words):
    """ Put the GloVe vectors into a matrix M.
        Param:
            wv_from_bin: KeyedVectors object; the 400000 GloVe vectors loaded from file
        Return:
            M: numpy matrix shape (num words, 200) containing the vectors
            word2ind: dictionary mapping each word to its row number in M
    """
    import random

    words = list(wv_from_bin.index_to_key)
    print("Shuffling words ...")
    random.seed(225)
    random.shuffle(words)
    print("Putting %i words into word2ind and matrix M..." % len(words))

    word2ind = {}
    M = []
    curInd = 0
    for w in words:
        try:
            M.append(wv_from_bin.get_vector(w))
            word2ind[w] = curInd
            curInd += 1
        except KeyError:
            continue
    # Make sure every required word gets a row, even if it was missed above.
    for w in required_words:
        if w in words:
            continue
        try:
            M.append(wv_from_bin.get_vector(w))
            word2ind[w] = curInd
            curInd += 1
        except KeyError:
            continue
    M = np.stack(M)
    print("Done.")
    return M, word2ind
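The cells below reference wv_from_bin without showing where it comes from; in this kind of assignment it is typically loaded through gensim's downloader. The loading cell isn't part of this excerpt, so the following is an assumption:

# Assumed loading cell (not shown in this excerpt): 200-dimensional GloVe vectors
# trained on Wikipedia + Gigaword, served through gensim's downloader.
import gensim.downloader as api
wv_from_bin = api.load("glove-wiki-gigaword-200")  # returns a KeyedVectors object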
# -----------------------------------------------------------------
# Run Cell to Reduce 200-Dimensional Word Embeddings to k Dimensions
# Note: This should be quick to run
# -----------------------------------------------------------------
M, word2ind = get_matrix_of_vectors(wv_from_bin, words)
M_reduced = reduce_to_k_dim(M, k=2)
# Rescale (normalize) the rows to make them each of unit-length
M_lengths = np.linalg.norm(M_reduced, axis=1)
M_reduced_normalized = M_reduced / M_lengths[:, np.newaxis]  # broadcasting
Question 2.1: GloVe Plot Analysis
Run the cell below to plot the 2D GloVe embeddings for ['movie', 'book', 'mysterious', 'story', 'fascinating', 'good', 'interesting', 'large', 'massive', 'huge'].
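The plotting cell itself isn't reproduced in this excerpt; a sketch of what it would contain, assuming the normalized embeddings and word2ind from the cells above:

# Hypothetical reconstruction of the plotting cell described above.
words = ['movie', 'book', 'mysterious', 'story', 'fascinating', 'good',
         'interesting', 'large', 'massive', 'huge']
plot_embeddings(M_reduced_normalized, word2ind, words)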
# ------------------
# Write your implementation here.
w1 = 'yes'
w2 = 'correct'
w3 = 'no'
print(f"the distance between {w1} and {w2} is {wv_from_bin.distance(w1, w2)}")
print(f"the distance between {w1} and {w3} is {wv_from_bin.distance(w1, w3)}")
# ------------------
the distance between yes and correct is 0.6043116450309753
the distance between yes and no is 0.4196847677230835
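gensim's distance(w1, w2) is 1 minus cosine similarity, so smaller values mean more similar: "yes" lands measurably closer to its antonym "no" (0.42) than to the near-synonym "correct" (0.60). This is a well-known quirk of distributional embeddings, since antonyms occur in nearly identical contexts. A one-line check of the identity:

# distance is defined as 1 - cosine similarity (float32 values, hence the tolerance).
assert abs(wv_from_bin.distance(w1, w3) - (1 - wv_from_bin.similarity(w1, w3))) < 1e-6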
# Run this cell to answer the analogy -- man : grandfather :: woman : x
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'grandfather'], negative=['man']))
# Run this cell
# Here `positive` indicates the list of words to be similar to and `negative` indicates the list of words to be
# most dissimilar from.
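For intuition, most_similar implements the 3CosAdd analogy: it averages the unit-normalized positive vectors, subtracts the normalized negative ones, and ranks the vocabulary by cosine similarity to the result, excluding the input words. A rough manual sketch of the same analogy (which skips the input-word exclusion, so 'grandfather' itself may top the list):

# Rough manual version of the analogy man : grandfather :: woman : x.
vec = (wv_from_bin.get_vector('grandfather', norm=True)
       + wv_from_bin.get_vector('woman', norm=True)
       - wv_from_bin.get_vector('man', norm=True))
pprint.pprint(wv_from_bin.similar_by_vector(vec, topn=5))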