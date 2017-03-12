1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20

def extract_post_content(file):
    """Return the text of the ``entry-content`` div of an HTML file.

    Args:
        file: Path of the HTML file to read.

    Returns:
        The ``.text`` of the first ``<div class="entry-content">`` element
        found by BeautifulSoup's ``html.parser``.

    Raises:
        AttributeError: If the document has no such div (``find`` then
            yields ``None``, which has no ``.text``).
    """
    # Read inside a context manager so the handle is closed right away
    # instead of lingering until garbage collection.
    with open(file) as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, "html.parser")
    return soup.find('div', attrs={'class': 'entry-content'}).text


def extract_all_text():
    """Extract the post text of every file listed in ``filepaths``.

    ``filepaths`` is read from the current working directory and is
    expected to hold one path per line; surrounding whitespace is stripped.

    Returns:
        The result of mapping ``extract_post_content`` over the listed
        paths (a list on Python 2, a lazy iterator on Python 3).
    """
    with open('filepaths') as f:
        stripped = (line.strip() for line in f)
        # Skip blank lines (e.g. a trailing newline) so they are not
        # passed on to open() as an empty path.
        file_list = [path for path in stripped if path]
    return map(extract_post_content, file_list)


def extract_segments(data):
    """Split ``data`` into word segments using jieba.

    Args:
        data: Text to segment.

    Returns:
        A list of stripped segments. Only segments longer than one
        character are kept; the length check is applied before stripping.
    """
    seg_list = jieba.cut(data, cut_all=False)
    return [seg.strip() for seg in seg_list if len(seg) > 1]


def tfidf_calc():
    """Build the space-joined corpus used as input for TF-IDF.

    Each post is segmented with ``extract_segments`` and its segments are
    joined with single spaces, giving one string per post.

    Returns:
        A list with one space-separated token string per post. (Earlier
        versions built this list but discarded it.)
    """
    corpus = [
        " ".join(item)
        for item in map(extract_segments, extract_all_text())
    ]
    return corpus


tfidf_calc()