# imports required by the methods below
import math
import os
import pickle

import jieba


def _load_stop_words(self):
    if not os.path.exists(self._stop_words_path):
        raise Exception(f"system stop words: {self._stop_words_path} not found")

    stop_words = []
    with open(self._stop_words_path, 'r', encoding='utf8') as reader:
        for line in reader:
            line = line.strip()
            if line:  # skip blank lines so empty strings are not kept as stop words
                stop_words.append(line)
    return stop_words
def _build_param(self):
    def _cal_param(reader_obj):
        f = []    # one dict per document, mapping each word to its frequency in that document
        df = {}   # document frequency: how many documents contain each word
        idf = {}  # idf value of each word
        lines = reader_obj.readlines()
        words_count = 0
        docs_list = []
        line_length_list = []
        for line in lines:
            line = line.strip()
            if not line:
                continue
            words = [word for word in jieba.lcut(line) if word and word not in self._stop_words]
            line_length_list.append(len(words))
            docs_list.append(line)
            words_count += len(words)
            tmp_dict = {}
            for word in words:
                tmp_dict[word] = tmp_dict.get(word, 0) + 1
            f.append(tmp_dict)
            for word in tmp_dict.keys():
                df[word] = df.get(word, 0) + 1
        # count only the kept (non-blank) lines so indexes into f/docs_list stay valid
        length = len(docs_list)
        for word, num in df.items():
            # BM25 idf: log((N - n + 0.5) / (n + 0.5))
            idf[word] = math.log(length - num + 0.5) - math.log(num + 0.5)
        param = BM25Param(f, df, idf, length, words_count / length, docs_list, line_length_list)
        return param
    # build parameters from the user-supplied corpus if given, otherwise from the system corpus
    if self.docs:
        if not os.path.exists(self.docs):
            raise Exception(f"input docs {self.docs} not found")
        with open(self.docs, 'r', encoding='utf8') as reader:
            param = _cal_param(reader)
    else:
        if not os.path.exists(self._docs_path):
            raise Exception(f"system docs {self._docs_path} not found")
        with open(self._docs_path, 'r', encoding='utf8') as reader:
            param = _cal_param(reader)
    # cache the computed parameters so later runs can load them instead of rebuilding
    with open(self._param_pkl, 'wb') as writer:
        pickle.dump(param, writer)
    return param
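
# A minimal sketch of the BM25Param container assumed by the methods above.
# The field order follows the constructor call in _cal_param; k1 and b are
# never passed there but are read in _cal_similarity, so they are assumed to
# have defaults (k1=1.5, b=0.75 are common BM25 choices, not taken from the
# original source).
from dataclasses import dataclass


@dataclass
class BM25Param:
    f: list                 # per-document term-frequency dicts
    df: dict                # document frequency of each word
    idf: dict               # idf value of each word
    length: int             # number of documents
    avg_length: float       # average document length in words
    docs_list: list         # the raw documents
    line_length_list: list  # word count of each document
    k1: float = 1.5         # assumed default
    b: float = 0.75         # assumed default
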
def _cal_similarity(self, words, index):
    score = 0
    for word in words:
        if word not in self.param.f[index]:
            continue
        # per-term BM25 contribution for the document at `index`
        numerator = self.param.idf[word] * self.param.f[index][word] * (self.param.k1 + 1)
        denominator = self.param.f[index][word] + self.param.k1 * (
            1 - self.param.b + self.param.b * self.param.line_length_list[index] / self.param.avg_length)
        score += numerator / denominator
    return score
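
# For reference, _cal_similarity implements the standard BM25 score of a query
# q against document d:
#
#   score(q, d) = sum over words w in q of
#       idf(w) * f(w, d) * (k1 + 1)
#       ------------------------------------------------
#       f(w, d) + k1 * (1 - b + b * |d| / avg_length)
#
# where f(w, d) is the frequency of w in d, |d| is line_length_list[index],
# and idf(w) = log((N - n_w + 0.5) / (n_w + 0.5)) as computed in _cal_param.
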
def cal_similarity(self, query: str):
    """
    Compute the similarity between the query and every document (results are unsorted).
    :param query: query string
    :return: [(doc, score), ...]
    """
    words = [word for word in jieba.lcut(query) if word and word not in self._stop_words]
    score_list = []
    for index in range(self.param.length):
        score = self._cal_similarity(words, index)
        score_list.append((self.param.docs_list[index], score))
    return score_list
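
# A hypothetical usage sketch: the class name `BM25` and its constructor
# argument are assumptions for illustration, since the class definition is not
# shown above; only cal_similarity and its return format come from the code itself.
if __name__ == '__main__':
    bm25 = BM25(docs="corpus.txt")                # hypothetical constructor
    results = bm25.cal_similarity("自然语言处理")  # [(doc, score), ...]
    # sort by score to get a ranked result list
    for doc, score in sorted(results, key=lambda x: x[1], reverse=True)[:5]:
        print(f"{score:.4f}\t{doc}")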