#!/usr/bin/env python
#encoding=utf-8
import redis,codecs,sys,time,datetime,doctest,re
# Python 2 idiom: reload sys and switch the interpreter's default string
# encoding to UTF-8 for the rest of the script.
reload(sys)
sys.setdefaultencoding('utf8')
class Unbuffered:
    """Stream proxy that flushes the wrapped stream after every write."""

    def __init__(self, stream):
        # Everything this object does is delegated to the wrapped stream.
        self.stream = stream

    def write(self, data):
        """Write *data* to the wrapped stream and flush it right away."""
        target = self.stream
        target.write(data)
        target.flush()

    def __getattr__(self, attr):
        """Forward any attribute not defined here to the wrapped stream."""
        return getattr(self.stream, attr)
# Replace stdout with the auto-flushing proxy so each write is flushed at once.
sys.stdout = Unbuffered(sys.stdout)
def read_keys():
    """Dump every key of Redis db 6 on localhost to ``query_keys.txt``.

    Prints the number of keys, then echoes each key to stdout and writes it
    to the output file (UTF-8), one key per line.
    """
    # The client has to exist before it is queried; calling r.keys() first
    # raised UnboundLocalError.
    r = redis.Redis(host='localhost', db=6)
    keys = r.keys()
    print(len(keys))
    with codecs.open("query_keys.txt", "w", "utf-8") as f:
        for key in keys:
            print(key)
            f.write("%s\n" % (key,))
def read_relevent_words():
    """Dump the value of every key in Redis db 6 to ``query_relevent_words.txt``.

    Prints the number of keys, then writes ``r.get(key)`` for each key to the
    output file (UTF-8), one value per line, in the order ``keys()`` returned.
    """
    # The client has to exist before it is queried; calling r.keys() first
    # raised UnboundLocalError.
    r = redis.Redis(host='localhost', db=6)
    keys = r.keys()
    print(len(keys))
    with codecs.open("query_relevent_words.txt", "w", "utf-8") as f:
        for key in keys:
            f.write("%s\n" % (r.get(key),))
def parser_one_line_one_words():
    """Explode ``query_relevent_words.txt`` into one word per line.

    Every input line is stripped and split on ``*``; each piece is written on
    its own line to ``parser_one_line_one_words.txt`` (UTF-8). Pieces are
    written as-is, so an empty piece produces an empty output line.
    """
    # Both files are closed by the context managers, also on error.
    with codecs.open("parser_one_line_one_words.txt", "w", "utf-8") as ff:
        with codecs.open("query_relevent_words.txt", "r", "utf-8") as f:
            for line in f:
                for elem in line.strip().split("*"):
                    ff.write("%s\n" % (elem,))
def parser_one_line_one_words2():
    """Explode ``query_relevent_words.txt`` and report the distinct word count.

    Writes the same output as ``parser_one_line_one_words`` (one ``*``-separated
    piece per line in ``parser_one_line_one_words.txt``) and additionally
    prints how many distinct stripped pieces were seen.
    """
    unique_words = set()
    # Both files are closed by the context managers, also on error.
    with codecs.open("parser_one_line_one_words.txt", "w", "utf-8") as ff:
        with codecs.open("query_relevent_words.txt", "r", "utf-8") as f:
            for line in f:
                for elem in line.strip().split("*"):
                    unique_words.add(elem.strip())
                    ff.write("%s\n" % (elem,))
    print(len(unique_words))
def compare_pareser_one_line_one_words_result_lost_line_for_tmp():
    """Report lines where the segmented result no longer matches its source.

    Walks ``parser_one_line_one_words_uniq.txt`` and
    ``parser_one_line_one_words_uniq_result.txt`` in lockstep. A pair matches
    when the stripped source line equals the result line with all spaces
    removed. For every mismatch the 1-based line number and both lines are
    printed (without their line terminators), followed by a 5 second pause.
    """
    with codecs.open("parser_one_line_one_words_uniq.txt", "r", "utf-8") as f1:
        with codecs.open("parser_one_line_one_words_uniq_result.txt", "r", "utf-8") as f2:
            for count, (a, b) in enumerate(zip(f1, f2), 1):
                if a.strip() != b.replace(" ", "").strip():
                    print("%s %s %s" % (count, a.rstrip("\r\n"), b.rstrip("\r\n")))
                    # Pause so a person watching the console can read the line.
                    time.sleep(5)
def build_invert_index():
    """Build an inverted index over the segmented wname text in Redis db 1.

    Each line of ``../output_result_process`` is split on single spaces and
    its 1-based line number is added to the Redis set named after each token.
    Commands are queued on a pipeline and executed every 10000 lines (with a
    timing printout) and once more at the end.
    """
    r = redis.Redis(db=1)
    p = r.pipeline()
    # Stream the file instead of readlines() so the input is not held in
    # memory, and make sure the handle is closed.
    with codecs.open("../output_result_process", "r", "utf-8") as source:
        for count, line in enumerate(source, 1):
            for elem in line.strip().split(" "):
                token = elem.strip()
                if token == "":
                    # Empty lines / double spaces would otherwise create a
                    # set under the empty key, which no lookup ever uses.
                    continue
                p.sadd(token, count)
            if count % 10000 == 0:
                print(count)
                print("batch insert to redis ...")
                s = datetime.datetime.now()
                p.execute()
                e = datetime.datetime.now()
                print("done:%s" % ((e - s).seconds))
    # Flush whatever is left from the last partial batch.
    p.execute()
def is_chinese(uchar):
    """Return True if *uchar* contains anything outside plain ASCII.

    Despite the name this is not a CJK range check. The value is converted
    between byte string and unicode using UTF-8 (parts that cannot be
    converted are ignored) and the result is True when the length changes,
    i.e. when at least one character is not a single byte in UTF-8.

    >>> is_chinese(u"人")
    True
    >>> is_chinese("人")
    True
    >>> is_chinese("1")
    False
    >>> is_chinese(" ")
    False
    """
    if isinstance(uchar, bytes):
        converted = uchar.decode("utf-8", "ignore")
    else:
        converted = uchar.encode("utf-8", "ignore")
    return len(converted) != len(uchar)
def is_number(uchar):
    """Return True when *uchar* sorts between u'0' and u'9' inclusive.

    The check is an ordinary string comparison, so a multi-character value is
    judged by its lexicographic order rather than character by character.
    """
    return u'\u0030' <= uchar <= u'\u0039'
???
def is_alphabet(uchar):
    """Return True when *uchar* sorts within u'A'..u'Z' or u'a'..u'z'.

    The check is an ordinary string comparison, so a multi-character value is
    judged by its lexicographic order rather than character by character.
    """
    return (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a')
def is_other(uchar):
    """Return True when none of is_chinese, is_number, is_alphabet accepts *uchar*."""
    return (not is_chinese(uchar)
            and not is_number(uchar)
            and not is_alphabet(uchar))
def _filter(line):
    """Drop the tokens of a segmented wname line that ``is_other`` rejects.

    The line is stripped and split on single spaces. Each token is stripped,
    decoded from UTF-8 if it is not already unicode, and kept only when
    ``is_other`` returns False for it. The kept tokens are returned joined by
    single spaces.
    """
    kept = []
    for elem in line.strip().split(" "):
        element = elem.strip()
        if not isinstance(element, type(u"")):
            element = element.decode("utf-8", "ignore")
        if not is_other(element):
            kept.append(element)
    return " ".join(kept)
def post_process_wname_segments_illegal_characters():
    """Filter every line of ``../output_result`` into ``../output_result_process``.

    Each input line is passed through ``_filter``; the result is echoed to
    stdout and written to the output file followed by a newline.
    """
    with codecs.open("../output_result_process", "w", "utf-8") as f:
        with codecs.open("../output_result", "r", "utf-8") as source:
            for line in source:
                # Filter once and reuse the result for both the echo and the file.
                s = _filter(line)
                print(s)
                f.write(s + "\n")
def _queue_word_segment_pairs(p, words_path, segments_path):
    """Store ``word -> segmentation`` for two line-aligned files via pipeline *p*.

    Line i of *words_path* is the key and line i of *segments_path* the value
    (both stripped). The pipeline is executed every 10000 pairs, with a timing
    printout, and once more at the end. If the files differ in length the
    extra lines of the longer one are ignored.
    """
    with codecs.open(words_path, "r", "utf-8") as f1:
        with codecs.open(segments_path, "r", "utf-8") as f2:
            for count, (a, b) in enumerate(zip(f1, f2), 1):
                p.set(a.strip(), b.strip())
                if count % 10000 == 0:
                    print(count)
                    print("batch insert to redis ...")
                    s = datetime.datetime.now()
                    p.execute()
                    e = datetime.datetime.now()
                    print("done:%s" % ((e - s).seconds))
    p.execute()


def build_word_segments_hash_map():
    """Load the original-word -> segmentation mapping into Redis db 2.

    Two pairs of line-aligned files are loaded: the unique relevant words with
    their PKU segmentation, then the query keys with theirs. A background
    save is requested once both are loaded.
    """
    r2 = redis.Redis(db=2)
    p = r2.pipeline()
    _queue_word_segment_pairs(
        p,
        "parser_one_line_one_words_uniq.txt",
        "parser_one_line_one_words_uniq_result_pku.txt",
    )
    _queue_word_segment_pairs(
        p,
        "query_keys.txt",
        "query_keys_result_pku.txt",
    )
    r2.bgsave()
def _build_list_for_inter_args(s1,s2):
??? """
??? 將分詞后的字符串組合成一個(gè)list形式反加給r.sinter使用,去除無用的東西
??? """
??? r=[]
??? r.extend(s1.split(" "))
??? r.extend(s2.split(" "))
??? return [elem.strip() for elem in r if elem.strip()<>""]
def final_find_synomns_out():
    """Write query/relevant-word pairs whose segment sets intersect.

    ``query_keys.txt`` and ``query_relevent_words.txt`` are read in lockstep.
    For each query and each non-empty ``*``-separated relevant word, the
    segmentations of both are fetched from Redis db 2 and the sets named after
    all their tokens are intersected in Redis db 1. When the intersection is
    non-empty, ``query|||word`` is appended to ``synomns_pku.txt``. Progress is
    printed every 1000 written pairs.
    """
    validateCount = 0
    with codecs.open("synomns_pku.txt", "w", "utf-8") as f:
        r1 = redis.Redis(db=1)
        r2 = redis.Redis(db=2)
        with codecs.open("query_keys.txt", "r", "utf-8") as f1:
            with codecs.open("query_relevent_words.txt", "r", "utf-8") as f2:
                for a, b in zip(f1, f2):
                    query = a.strip()
                    query_segments = r2.get(query)
                    for elem in b.split("*"):
                        word = elem.strip()
                        if word == "":
                            continue
                        keys = _build_list_for_inter_args(query_segments, r2.get(word))
                        if len(r1.sinter(*keys)) > 0:
                            validateCount += 1
                            if validateCount % 1000 == 0:
                                print("validateCount:%s\n" % validateCount)
                            f.write("%s|||%s\n" % (query, word))
                            # Flush per hit so partial results survive an abort.
                            f.flush()
def interactive_mode():
    """Interactively inspect the segment intersection of a query/word pair.

    Repeatedly reads ``query|||relevent_word`` from stdin, prints the stored
    segmentation of both sides (Redis db 2) and the intersection of the sets
    named after their tokens (Redis db 1). Malformed input or a word without a
    stored segmentation is reported and the loop continues; end of input
    leaves the loop.
    """
    # The clients do not change between prompts, so build them once.
    r1 = redis.Redis(db=1)
    r2 = redis.Redis(db=2)
    while True:
        try:
            user_input = raw_input("input query|||relevent_word:\n")
        except EOFError:
            break
        parts = user_input.strip().split("|||")
        if len(parts) != 2:
            print("expected input of the form query|||relevent_word")
            continue
        a = parts[0].strip()
        b = parts[1].strip()
        query_segments = r2.get(a)
        word_segments = r2.get(b)
        print("%s ==> %s" % (a, query_segments))
        print("%s ==> %s" % (b, word_segments))
        if query_segments is None or word_segments is None:
            # Without both segmentations there is nothing to intersect.
            print("no segmentation stored for at least one of the words")
        else:
            print(r1.sinter(*_build_list_for_inter_args(query_segments, word_segments)))
        print("=========")
def c1(line):
    """Return True when either side of ``a|||b`` contains a space.

    >>> c1("執手|||把手")
    False
    """
    a, b = line.strip().split("|||")
    return " " in a or " " in b
# Module-level client for Redis db 2; c4() refers to it through ``global r2``.
r2=redis.Redis(db=2)
def c4(s1, s2):
    """Return True when the token set of one word is contained in the other's.

    Equal inputs are True immediately. Otherwise each word is replaced by its
    segmentation from Redis db 2 when one is stored (module-level ``r2``),
    split on spaces into a token set, and the result is True when the two sets
    share at least one token and the intersection is as large as the smaller
    set.

    The examples below depend on the segmentations stored in Redis:

    >>> c4("無線鼠套裝","無線鍵鼠套裝")
    False
    >>> c4("行政職業能力測驗真題","行測真題")
    False
    """
    if s1 == s2:
        return True

    def token_set(word):
        # One GET instead of EXISTS followed by GET: a missing key is None.
        segmented = r2.get(word)
        if segmented is not None:
            word = segmented.strip()
        if " " in word:
            return set(tok.strip() for tok in word.split(" ") if tok.strip() != "")
        return set([word.strip()])

    set1 = token_set(s1)
    set2 = token_set(s2)
    common = set1 & set2
    if len(common) == 0:
        return False
    return len(common) >= min(len(set1), len(set2))
???
???
def c3(line):
    """Tell whether the multi-word side(s) of ``a|||b`` contain a non-letter word.

    A side counts only when it has more than one space-separated word; it is
    then "mixed" when at least one of its words is not made purely of ASCII
    letters. If both sides are multi-word, both must be mixed; if only one is,
    that side decides; if neither is, the result is False.

    >>> c3("執手|||把手")
    False
    >>> c3("the north face|||tnf")
    False
    >>> c3("the 大north face|||tnf")
    True
    >>> c3("wd1tb|||i5 2320")
    True
    """
    letters_only = re.compile(r"[a-zA-Z]{1,}\Z")

    def has_non_letter_word(words):
        # A single word has no neighbour to be compared with, so it never counts.
        if len(words) < 2:
            return False
        return any(letters_only.match(word.strip()) is None for word in words)

    a, b = line.strip().split("|||")
    a_words = a.split(" ")
    b_words = b.split(" ")
    a_multi = len(a_words) > 1
    b_multi = len(b_words) > 1
    if a_multi and b_multi:
        return has_non_letter_word(a_words) and has_non_letter_word(b_words)
    if b_multi:
        return has_non_letter_word(b_words)
    if a_multi:
        return has_non_letter_word(a_words)
    return False
??? ???
def c2(line):
    """Return True when one side of ``a|||b`` is a substring of the other.

    >>> c2("執手|||把手")
    False

    >>> c2("濃縮咖啡|||咖啡")
    True
    """
    left, right = line.strip().split("|||")
    if left in right:
        return True
    return right in left
def filter_synonym_result():
    """Filter ``synomns_pku.txt`` into ``synomns_pku_filter.txt``.

    A ``query|||relevent_word`` line is kept only when neither side contains a
    space (``c1`` is False) and neither side is a substring of the other
    (``c2`` is False). Kept lines are copied unchanged.
    """
    with codecs.open("synomns_pku_filter.txt", "w", "utf-8") as f:
        with codecs.open("synomns_pku.txt", "r", "utf-8") as source:
            for line in source:
                if not c1(line) and not c2(line):
                    f.write(line)
???
def test_redis_is_ready():
    """Print ``info()`` of a default-configured Redis client as a readiness check."""
    print(redis.Redis().info())
???
def pivot_query_relvent_word_order_and_intersation_size():
    """Pivot the filtered synonym pairs into per-query records and dump them.

    Redis db 3 holds one hash per query: the field is a relevant word and the
    value is the string form of a list. ``step2`` stores
    ``[relevent word order]`` and ``step3`` appends the intersection size,
    giving ``[relevent word order, intersation_size]``.

    The work is split into nested steps meant to be run one after another.
    As in the original, only ``step4`` (the dump) is invoked; the calls to the
    earlier steps are left commented out.
    """
    debug = False
    r1 = redis.Redis(db=1)
    r2 = redis.Redis(db=2)
    r3 = redis.Redis(db=3)
    #r3.flushdb()
    p = r3.pipeline()

    def step1():
        """Create an empty-list entry for every pair in synomns_pku_filter.txt."""
        count = 0
        with codecs.open("synomns_pku_filter.txt", "r", "utf-8") as source:
            for line in source:
                count += 1
                a, b = line.split("|||")
                a = a.strip()
                b = b.strip()
                # Store the list explicitly as its string form; step3/step4
                # read it back with eval().
                p.hset(a, b, str([]))
                if count % 10000 == 0:
                    p.execute()
                    print("執(zhí)行一次批量提交redis操作")
                if count == 1 and debug:
                    break
        p.execute()
    #step1()

    def step2():
        """Record the 1-based position of each relevant word for existing entries."""
        count = 0
        exists_count = 0
        not_exists_count = 0
        with codecs.open("query_keys.txt", "r", "utf-8") as f1:
            with codecs.open("query_relevent_words.txt", "r", "utf-8") as f2:
                for a, b in zip(f1, f2):
                    count += 1
                    a = a.strip()
                    b = b.strip()
                    for idx, elem in enumerate(b.split("*")):
                        element = elem.strip()
                        if element == "":
                            continue
                        if r3.hexists(a, element):
                            exists_count += 1
                            r3.hset(a, element, str([idx + 1]))
                        else:
                            not_exists_count += 1
                    # Progress is reported once per 10000 input lines.
                    if count % 10000 == 0:
                        print("exists_count:%s" % exists_count)
                        print("not_exists_count:%s" % not_exists_count)
                    if count == 1 and debug:
                        break
        print("exists_count:%s" % exists_count)
        print("not_exists_count:%s" % not_exists_count)
        print("step2 finished")
    #step2()

    def test_step1_and_step2_is_ok():
        """Spot-check one known entry: its stored value must evaluate to a list."""
        result = r3.hget("透明茶杯", "茶具")
        # NOTE(review): eval() on data read back from Redis; only safe while
        # this script is the sole writer of db 3.
        if isinstance(eval(result), list):
            print("正確")
        else:
            print("不正確")
    #test_step1_and_step2_is_ok()

    def step3():
        """Recompute each pair's intersection size and append it to the entry."""
        validateCount = 0
        with codecs.open("synomns_pku_filter.txt", "r", "utf-8") as source:
            for line in source:
                a, b = line.strip().split("|||")
                a = a.strip()
                b = b.strip()
                query_segments = r2.get(a)
                intersation_len = len(
                    r1.sinter(*_build_list_for_inter_args(query_segments, r2.get(b))))
                if intersation_len > 0:
                    # NOTE(review): eval() on data read back from Redis.
                    list_result = eval(r3.hget(a, b))
                    if len(list_result) != 1:
                        # step2 must have stored exactly the word order here.
                        print("%s %s" % (a, b))
                        print("%s %s" % (type(a), type(b)))
                        print("ERROR")
                        sys.exit(-1)
                    list_result.append(intersation_len)
                    r3.hset(a, b, str(list_result))
                    validateCount += 1
                    if validateCount % 1000 == 0:
                        print("validateCount:%s\n" % validateCount)
        print("final validateCount %s" % validateCount)
    #step3()

    def step4():
        """Write every query with its relevant words to synomns_pku_filter_process.txt.

        Words are ordered by the second list element descending, then by the
        first list element descending.
        """
        with codecs.open("synomns_pku_filter_process.txt", "w", "utf-8") as f:
            count = 0
            for key in r3.keys():
                count += 1
                print(count)
                f.write("%s" % key)
                z = r3.hgetall(key)
                for k in list(z):
                    # NOTE(review): eval() on data read back from Redis.
                    z[k] = eval(z[k])
                # e.g. [('b', [2, 4]), ('a', [1, 3])]
                ranked = sorted(
                    z.items(),
                    key=lambda item: (item[1][1], item[1][0]),
                    reverse=True,
                )
                for word, orders in ranked:
                    f.write("|||%s,%s" % (word, str(orders)))
                f.write("\n")
                f.flush()

    step4()
def _find_short_name(s1,s2):
??? """
??? >>> _find_short_name("行測","行政能力測試")
??? True
??? >>> _find_short_name("AB","ABC")
??? False
??? >>> _find_short_name("A","D")
??? False
??? """
??? if len(s1)>=len(s2):
??? ??? return False
??? if s1 in s2:
??? ??? return False
??? return set(s1).issubset(set(s2))???
def find_short_name():
    """Print the pairs of ``synomns_pku_filter.txt`` where one side abbreviates the other.

    Each ``a|||b`` line is checked with ``_find_short_name`` in both
    directions; matching pairs are printed as ``a|||b``.
    """
    with codecs.open("synomns_pku_filter.txt", "r", "utf-8") as source:
        for line in source:
            a, b = line.strip().split("|||")
            a = a.strip()
            b = b.strip()
            if _find_short_name(a, b) or _find_short_name(b, a):
                print("%s|||%s" % (a, b))
def find_short_name2():
    """Collect candidate pairs from the raw query / relevant-word files.

    ``query_keys.txt`` and ``query_relevent_words.txt`` are read in lockstep.
    For each query and each non-empty ``*``-separated relevant word the line
    ``query|||word`` is written to ``short_name_global.txt`` when ``c3``, ``c2``
    and ``c4`` all return False for the pair. Every hit is echoed with a
    running count; the number of processed lines is printed every 10000 lines
    and at the end.
    """
    count = 0
    validateCount = 0
    with codecs.open("short_name_global.txt", "w", "utf-8") as f:
        with codecs.open("query_keys.txt", "r", "utf-8") as f1:
            with codecs.open("query_relevent_words.txt", "r", "utf-8") as f2:
                for a, b in zip(f1, f2):
                    count += 1
                    a = a.strip()
                    b = b.strip()
                    for elem in b.split("*"):
                        element = elem.strip()
                        if element == "":
                            continue
                        line = "%s|||%s\n" % (a, element)
                        if not c3(line) and not c2(line) and not c4(a, element):
                            validateCount += 1
                            print("validateCount:%s" % validateCount)
                            print(line)
                            f.write(line)
                            # Flush per hit so partial results survive an abort.
                            f.flush()
                    if count % 10000 == 0:
                        print("cout===========>%s" % count)
    print("validateCount:%s" % validateCount)
    print("cout===========>%s" % count)
def test_sorted():
    """Print a sample list ordered the way the pivot dump orders its entries.

    Items are ``(name, [first, second])`` and are sorted by ``second``
    descending, then by ``first`` descending.
    """
    a = [('a', [1, 2]), ("b", [0, 2]), ("c", [-1, 3])]
    # A key function with reverse=True gives the same order as the former
    # comparison function and does not shadow the builtin cmp.
    print(sorted(a, key=lambda item: (item[1][1], item[1][0]), reverse=True))
def _find_only_one_word_difference(line):
??? """
??? >>> _find_only_one_word_difference("毛領(lǐng)毛衣|||毛領(lǐng)衣服")
??? True
??? """
??? return True
def find_only_one_word_difference():
    """Keep the pairs of ``short_name_global.txt`` that differ in one character.

    A line ``a|||b`` is copied to ``short_name_global_filter.txt`` when both
    sides have the same length (at least 2), are not equal, share exactly
    ``len(a) - 1`` distinct characters, and at least one side has an extra
    character whose first sampled member is not a digit.
    """
    def first_extra_is_not_digit(extra):
        # str() mirrors the conversion the check has always applied before
        # isdigit(); an empty difference never qualifies.
        return len(extra) > 0 and not str(list(extra)[0]).isdigit()

    with codecs.open("./short_name_global_filter.txt", "w", "utf-8") as f:
        with codecs.open("./short_name_global.txt", "r", "utf-8") as source:
            for line in source:
                a, b = line.strip().split("|||")
                if len(a) != len(b) or a == b:
                    continue
                set1 = set(a)
                set2 = set(b)
                m = len(a) - 1
                if m > 0 and len(set1 & set2) == m:
                    if (first_extra_is_not_digit(set1 - set2)
                            or first_extra_is_not_digit(set2 - set1)):
                        f.write(line)
??? ??? ???
def find_human_names():
    """Extract likely person-name pairs from ``short_name_global.txt``.

    A line ``a|||b`` is copied to ``short_name_global_xin.txt`` when both sides
    are shorter than 5 characters and start with the same surname from the
    built-in list (matched on the first two characters or on the first one),
    e.g. 蘇軾 / 蘇東坡. The size of the raw surname set is printed first.
    """
    # NOTE(review): some entries below contain a parenthesised Latin fragment
    # (for example the ones ending in "(fèi)" or "(guān)"), which looks like
    # text-conversion debris; such entries cannot match a 1-2 character
    # prefix. They are left untouched here - confirm against the original list.
    xins=['白','畢','卞','蔡','曹','岑','常','車','陳','成','程','池','鄧','丁','范','方','樊','費(fèi)','馮','符','傅','甘','高','葛','龔','古','關(guān)','郭','韓','何','賀','洪','侯','胡','華','黃','霍','姬','簡','江','姜','蔣','金','康','柯','孔','賴','郎','樂','雷','黎','李','連','','梁','廖','林','凌','劉','柳','龍','盧','魯','陸','路','呂','羅','駱','馬','梅','孟','莫','母','穆','倪','寧','歐','區(qū)','潘','彭','','皮','齊','戚','錢','強(qiáng)','秦','丘','邱','饒','任','沈','盛','施','石','時(shí)','史','司徒','蘇','孫','譚','湯','唐','陶','田','童','涂','王','危','韋','衛(wèi)','魏','溫','文','翁','巫','鄔','吳','伍','武','席','夏','蕭','謝','辛','邢','徐','許','薛','嚴(yán)','顏','楊','葉','易','殷','尤','于','余','俞','虞','元','袁','岳','云','曾','詹','張','章','趙','鄭','鐘','周','鄒','朱','褚','莊','卓']
    xins+=['李','王','張','劉','陳','黃','周','吳','徐','孫','胡','朱','高','林','何','郭','馬','羅','梁','宋','鄭','謝','韓','唐','馮','于','董','蕭','程','曹','袁','鄧','許','傅','沈','曾','彭','呂','蘇','盧','蔣','蔡','賈','丁','魏','薛','葉','閻','余','潘','杜','戴','夏','','汪','田','任','姜','范','方','石','姚','譚','廖','鄒','熊','金','陸','郝','孔','白','崔','康','毛','邱','秦','江','史','顧','侯','','孟','龍','萬','段','章','錢','湯','尹','黎','易','常','武','喬','賀','賴','龔','文']
    xins+=['鮑俎','百里','碧魯','伯賞','北堂','陳林','淳于','第五','東方','東郭','東門','段干','獨(dú)孤','端木','范姜','哥舒','公良','公孫','公西','公冶','公羊','緱亢','谷梁','歸海','赫連','胡母','呼延','黃方','皇甫','即墨','夾谷','晉楚','況后','梁丘','令狐','陸費(fèi)','閭丘','閭邱','明哲','墨哈','慕容','萬俟','南宮','南郭','南門','年愛','歐陽','濮陽','漆雕','亓官','屈突','壤駟','汝鄢','司馬','司空','司寇','司徒','官','商牟','申屠','侍其','疏束','叔孫','太史','太叔','澹臺(tái)','涂欽','拓拔','完完','完顏','王子','聞人','微生','巫馬','烏雅','鐵筆','夏','許世','軒轅','閆法','羊舌','陽佟','耶律','有琴','尉遲','余佴','宇文','岳帥','樂正','宰父','子車','子陽','宗政','左丘','張簡','章佳','長孫','鄭余','仲孫','鐘離','諸葛','顓孫']
    xins+=['付']
    xins+=['李','王','張','劉','陳','楊','黃','孫','周','吳','徐','趙','朱','馬','胡','郭','林','何','高','梁','鄭','羅','宋','謝','唐','韓','曹','許','鄧','蕭','馮','曾','程','蔡','彭','潘','袁','于','董','余','蘇','葉','呂','魏','蔣','田','杜','丁','沈','姜','范','江','傅','','盧','汪','戴','崔','任','陸','廖','姚','方','金','邱','夏','譚','韋','賈','鄒','石','熊','孟','秦','閻','薛','侯','雷','白','龍','','郝','孔','邵','史','毛','常','萬','顧','賴','武','康','賀','嚴(yán)','尹','錢','施','牛','洪','龔','湯','陶','黎','溫','莫','易','樊','','文','安','殷','顏','莊','章','魯','倪','龐','邢','俞','翟','藍(lán)','聶','齊','向','申','葛','柴','伍','覃','駱','關(guān)','焦','柳','歐','','紀(jì)','尚','畢','耿','蘆','左','季','管','符','辛','苗','詹','曲','歐陽','靳','祁','路','涂','蘭','甘','裴','梅','童','翁','霍','游','阮','尤','岳','柯','牟','滕','谷','舒','卜','成','饒','寧','凌','盛','查','單','冉','鮑','華','包','屈','房','喻','解','蒲','衛(wèi)','簡','時(shí)','連','車','項(xiàng)','閔','鄔','吉','黨','陽','司','費(fèi)','蒙','席','晏','隋','古','強(qiáng)','穆','姬','宮','景','米','麥','談','柏','瞿','艾','沙','鄢','桂','竇','郁','繆','暢','鞏','卓','褚','欒','戚','全','婁','甄','郎','池','叢','邊','岑','農(nóng)','茍','遲','保','商','臧','','卞','虞','刁','冷','應(yīng)','匡','栗','仇','練','楚','揭','師','官','佟','封','燕','桑','巫','敖','原','植','鄺','仲','荊','儲(chǔ)','宗','','干','苑','寇','蓋','南','屠','鞠','榮','井','樂','銀','奚','明','麻','雍','花','聞','冼','木','郜','廉','衣','藺','和','冀','占','','門','帥','利','滿','陳生']
    xins = set(xins)
    print(len(xins))

    # Build a set of unicode surnames for O(1) lookups. Byte-string literals
    # are decoded first because a unicode prefix never hashes equal to a
    # non-ASCII byte string.
    surnames = set()
    for elem in xins:
        name = elem.strip()
        if isinstance(name, bytes):
            name = name.decode("utf-8")
        if name != "":
            surnames.add(name)

    with codecs.open("./short_name_global_xin.txt", "w", "utf-8") as f:
        with codecs.open("./short_name_global.txt", "r", "utf-8") as source:
            for line in source:
                a, b = line.strip().split("|||")
                a = a.strip()
                b = b.strip()
                # The length limit applies to both kinds of surname match;
                # previously operator precedence exempted two-character matches.
                if len(a) >= 5 or len(b) >= 5:
                    continue
                shares_double = a[:2] == b[:2] and a[:2] in surnames
                shares_single = a[:1] == b[:1] and a[:1] in surnames
                if shares_double or shares_single:
                    f.write(line)
def extrace_names():
    """Print surnames extracted from four embedded text snippets.

    The snippets are, in order: an HTML table of surname links, a ranked
    list of 100 surnames, a list of two-character surnames annotated with
    pinyin, and a whitespace-separated grid of surnames.  For each snippet
    the extracted names are printed to stdout formatted like a Python list
    literal (``['a','b',...]``).  Nothing is returned.

    Lines and tokens are split with ``splitlines()`` / ``split()`` so the
    result does not depend on trailing spaces or CRLF line endings being
    present in this source file.
    """
    # NOTE(review): several entries below contain a parenthesised pinyin
    # fragment inside the name itself; these look like extraction
    # artifacts in the data -- confirm against the original lists.

    # --- 1. HTML table: the text after "target=_blank>" up to "</TD>" ---
    table_html = """
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_bai2.html target=_blank>白</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_bi4.html target=_blank>畢</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/bian040600.html target=_blank>卞</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_cai4.html target=_blank>蔡</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_cao2.html target=_blank>曹</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_cen2.html target=_blank>岑</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/chang040600.html target=_blank>常</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_che.html target=_blank>車</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_chen2.html target=_blank>陳</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/cheng030100.html target=_blank>成</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_cheng2.html target=_blank>程</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_chi2.html target=_blank>池</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_deng4.html target=_blank></a>鄧</TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ding.html target=_blank>丁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_fan4.html target=_blank>范</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_fang.html target=_blank></a>方</TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/fan140600.html target=_blank>樊</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/fei140600.html target=_blank>費(fèi)</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_feng2.html target=_blank>馮</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_fu2.html target=_blank>符</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_fu4.html target=_blank>傅</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_gan.html target=_blank>甘</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_gao.html target=_blank>高</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/ge170100.html target=_blank>葛</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_gong.html target=_blank>龔</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_gu3.html target=_blank>古</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_guan.html target=_blank>關(guān)</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_guo.html target=_blank>郭</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_han2.html target=_blank>韓</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_he2.html target=_blank>何</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/he140600.html target=_blank>賀</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_hong2.html target=_blank>洪</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_hou2.html target=_blank>侯</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_hu2.html target=_blank>胡</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_hua4.html target=_blank>華</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_huang2.html target=_blank>黃</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/huo010600.html target=_blank>霍</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/ji030100.html target=_blank>姬</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jian3.html target=_blank>簡</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jiang1.html target=_blank>江</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jiang.html target=_blank>姜</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jiang3.html target=_blank>蔣</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jin.html target=_blank>金</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_kang.html target=_blank>康</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ke.html target=_blank>柯</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_kong3.html target=_blank>孔</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lai4.html target=_blank>賴</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/lang170100.html target=_blank>郎</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/le140600.html target=_blank>樂</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lei2.html target=_blank>雷</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_li2.html target=_blank>黎</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_li3.html target=_blank>李</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lian2.html target=_blank>連</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/lian140600.html target=_blank>廉</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/liang030100.html target=_blank>梁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_liao4.html target=_blank>廖</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lin2.html target=_blank>林</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ling2.html target=_blank>凌</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_liu2.html target=_blank>劉</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_liu3.html target=_blank>柳</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_long2.html target=_blank>龍</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lu2.html target=_blank>盧</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/lu170100.html target=_blank>魯</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lu4.html target=_blank>陸</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/lu140600.html target=_blank>路</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lv3.html target=_blank>呂</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_luo2.html target=_blank>羅</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_luo4.html target=_blank>駱</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ma3.html target=_blank>馬</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_mei2.html target=_blank>梅</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/meng140600.html target=_blank>孟</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_mo4.html target=_blank>莫</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/mu030100.html target=_blank>母</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/mu130700.html target=_blank>穆</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ni2.html target=_blank>倪</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ning2.html target=_blank>寧</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ou.html target=_blank>歐</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/ou030100.html target=_blank>區(qū)</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_pan.html target=_blank>潘</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_peng2.html target=_blank>彭</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_pu2.html target=_blank>蒲</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/pi130700.html target=_blank>皮</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/qi130700.html target=_blank>齊</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/qi030100.html target=_blank>戚</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_qian2.html target=_blank>錢</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/qiang310500.html target=_blank>強(qiáng)</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_qing2.html target=_blank>秦</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/qiu030100.html target=_blank>丘</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_qiu.html target=_blank>邱</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_rao2.html target=_blank>饒</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ren2.html target=_blank>任</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_shen3.html target=_blank>沈</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/sheng010600.html target=_blank>盛</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_shi.html target=_blank>施</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_shi2.html target=_blank>石</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/shi300500.html target=_blank>時(shí)</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_shi3.html target=_blank>史</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/situ030100.html target=_blank>司徒</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_su.html target=_blank>蘇</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_sun.html target=_blank>孫</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tan2.html target=_blank>譚</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tang.html target=_blank>湯</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tang2.html target=_blank>唐</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tao2.html target=_blank>陶</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tian2.html target=_blank>田</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/tong040600.html target=_blank>童</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tu2.html target=_blank>涂</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wang2.html target=_blank>王</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/wei010600.html target=_blank>危</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wei3.html target=_blank>韋</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/wei180100a.html target=_blank>衛(wèi)</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wei4.html target=_blank>魏</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wen.html target=_blank>溫</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wen2.html target=_blank>文</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_weng.html target=_blank>翁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu.html target=_blank>巫</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu1.html target=_blank>鄔</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu2.html target=_blank>吳</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu3.html target=_blank>伍</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu3a.html target=_blank>武</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/xi040600.html target=_blank>席</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xia4.html target=_blank>夏</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xiao.html target=_blank>蕭</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xie4.html target=_blank>謝</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xin.html target=_blank>辛</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xing2.html target=_blank>邢</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xu2.html target=_blank>徐</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xu3.html target=_blank>許</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xue.html target=_blank>薛</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yan2.html target=_blank>嚴(yán)</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yan2a.html target=_blank>顏</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yang2.html target=_blank>楊</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ye4.html target=_blank>葉</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yi4.html target=_blank>易</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yin020600.html target=_blank>殷</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_you2.html target=_blank>尤</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yu010600.html target=_blank>于</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yu2.html target=_blank>余</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yu020600a.html target=_blank>俞</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yu020600.html target=_blank>虞</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/yuan310500.html target=_blank>元</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yuan2.html target=_blank>袁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yue030100.html target=_blank>岳</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yun2.html target=_blank>云</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zeng.html target=_blank>曾</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhan.html target=_blank>詹</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhang.html target=_blank>張</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhang1.html target=_blank>章</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhao4.html target=_blank>趙</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zheng4.html target=_blank>鄭</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhong.html target=_blank>鐘</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhou.html target=_blank>周</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhou1.html target=_blank>鄒</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhu.html target=_blank>朱</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/zhu180100.html target=_blank>褚</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhuang.html target=_blank>莊</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhuo.html target=_blank>卓</a></TD>
"""
    cells = re.findall(r"target=_blank>(?P<name>[\s\S]*?)</TD>", table_html)
    # Some cells close the anchor before the name, so the captured text may
    # carry a stray "</a>"; it is removed from the joined output.
    print(("['" + "','".join(cells) + "']").replace("</a>", ""))

    # --- 2. Ranked list: each line is "<rank><surname>" ---
    ranked_text = u"""
1李
2王
3張
4劉
5陳
6楊
7趙
8黃
9周
10吳
11徐
12孫
13胡
14朱
15高
16林
17何
18郭
19馬
20羅
21梁
22宋
23鄭
24謝
25韓
26唐
27馮
28于
29董
30蕭
31程
32曹
33袁
34鄧
35許
36傅
37沈
38曾
39彭
40呂
41蘇
42盧
43蔣
44蔡
45賈
46丁
47魏
48薛
49葉
50閻
51余
52潘
53杜
54戴
55夏
56鐘
57汪
58田
59任
60姜
61范
62方
63石
64姚
65譚
66廖
67鄒
68熊
69金
70陸
71郝
72孔
73白
74崔
75康
76毛
77邱
78秦
79江
80史
81顧
82侯
83邵
84孟
85龍
86萬
87段
88章
89錢
90湯
91尹
92黎
93易
94常
95武
96喬
97賀
98賴
99龔
100文
"""
    list2 = []
    for line in ranked_text.strip().splitlines():
        line = line.strip()
        if line:
            # The surname is the last character of the line.
            list2.append("'" + line[-1] + "'")
    print("[" + ",".join(list2) + "]")

    # --- 3. Compound surnames: "name(pinyin)" items separated by "、" ---
    compound_text = u"""
鮑俎(bao zu)、百里(bai li)、碧魯(bi lu)、伯賞(bo shang)、北堂(bei tang)
單于(chan yu)、陳林(chen lin)、淳于(chun yu)、
第五(di wu)、 東方(dong fang)、東郭(dong guo)、東門(dong men)、段干(duan gan)、獨(dú)孤(du gu)、端木(duan mu)、
范姜(fan jiang)、
哥舒(ge shu)、公良(gong liang)、公孫(gong sun)、公西(gong xi)、公冶(gong yan)、公羊(gong yang)、緱亢(gou kang)、谷梁(gu liang)、歸海(gui hai)、
赫連(he lian)、胡母(hu mu)、呼延(hu yan)、黃方(huang fang)、皇甫(huang fu)、
即墨(ji mo)、夾谷(jia gu)、晉楚(jin chu)、
況后(kuang hou)、
梁丘(liang qiu)、令狐(ling hu)、陸費(fèi)(lu fei)、閭丘(lv qiu)、閭邱(lv qiu)、
明哲(ming zhe)、墨哈(mo ha)、慕容(mu rong)、萬俟(mò qí)
鈉蘭(na lan)、南宮(nan gong)、南郭(nan guo)、南門(nan men)、年愛(nian ai)、
歐陽(ou yang)、
濮陽(pu yang)、
漆雕(qi diao)、亓官(qi guan)、屈突(qu tu)、
壤駟(rang si)、汝鄢(ru yan)、
司馬(si ma)、司空(si kong)、司寇(si kou)、司徒(si tu)、上官(shang guan)、商牟(shang mou)、申屠(shen tu)、侍其(shi qi)、疏束(shu su)、叔孫(shu sun)、
太史(tai shi)、太叔(tai shu)、澹臺(tái)(tan tai)、涂欽(tu qin)、拓拔(tuo ba)、
完完(wan wan)、完顏(wan yan)、王子(wang zi)、聞人(wen ren)、微生(wei sheng)、巫馬(wu ma)、烏雅(wu ya)、鐵筆(tie bi)
西門(xi men)、夏侯(xia hou)、許世(xu shi)、軒轅(xuan yuan)、
閆法(yan fa)、羊舌(yang she)、陽佟(yang tong)、耶律(ye lv)、有琴(you qin)、尉遲(yu chi)、余佴(yu er)、宇文(yu wen)、岳帥(yue shuai)、樂正(yue zheng)、
宰父(zai fu)、子車(zi che)、子陽(zi yang)、宗政(zong zheng)、左丘(zuo qiu)、張簡(zhang jian)、章佳(zhang jia)、長孫(zhang sun)、鄭余(zheng yu)、仲孫(zhong sun)、鐘離(zhong li)、諸葛(zhu ge)、顓孫(zhuan sun)、
"""
    list3 = []
    # Splitting per line first matters: a line that does not end with the
    # separator would otherwise be glued to the first item of the next
    # line, and that item would be lost when only two characters are kept.
    for line in compound_text.strip().splitlines():
        list3.extend(
            "'" + item.strip()[:2] + "'"
            for item in line.split(u"、")
            if item.strip()
        )
    for quoted in list3:
        print(quoted)
    print("[" + ",".join(list3) + "]")

    # --- 4. Grid: whitespace-separated surnames, purely numeric tokens skipped ---
    grid_text = u"""
李 王 張 劉 陳 楊 黃 孫 周 吳
徐 趙 朱 馬 胡 郭 林 何 高 梁
鄭 羅 宋 謝 唐 韓 曹 許 鄧 蕭
馮 曾 程 蔡 彭 潘 袁 于 董 余
蘇 葉 呂 魏 蔣 田 杜 丁 沈 姜
范 江 傅 鐘 盧 汪 戴 崔 任 陸
廖 姚 方 金 邱 夏 譚 韋 賈 鄒
石 熊 孟 秦 閻 薛 侯 雷 白 龍
段 郝 孔 邵 史 毛 常 萬 顧 賴
武 康 賀 嚴(yán) 尹 錢 施 牛 洪 龔
湯 陶 黎 溫 莫 易 樊 喬 文 安
殷 顏 莊 章 魯 倪 龐 邢 俞 翟
藍(lán) 聶 齊 向 申 葛 柴 伍 覃 駱
關(guān) 焦 柳 歐 祝 紀(jì) 尚 畢 耿 蘆
左 季 管 符 辛 苗 詹 曲 歐陽 靳
祁 路 涂 蘭 甘 裴 梅 童 翁 霍
游 阮 尤 岳 柯 牟 滕 谷 舒 卜
成 饒 寧 凌 盛 查 單 冉 鮑 華
包 屈 房 喻 解 蒲 衛(wèi) 簡 時(shí) 連
車 項(xiàng) 閔 鄔 吉 黨 陽 司 費(fèi) 蒙
席 晏 隋 古 強(qiáng) 穆 姬 宮 景 米
麥 談 柏 瞿 艾 沙 鄢 桂 竇 郁
繆 暢 鞏 卓 褚 欒 戚 全 婁 甄
郎 池 叢 邊 岑 農(nóng) 茍 遲 保 商
臧 佘 卞 虞 刁 冷 應(yīng) 匡 栗 仇
練 楚 揭 師 官 佟 封 燕 桑 巫
敖 原 植 鄺 仲 荊 儲(chǔ) 宗 樓 干
苑 寇 蓋 南 屠 鞠 榮 井 樂 銀
奚 明 麻 雍 花 聞 冼 木 郜 廉
衣 藺 和 冀 占 公 門 帥 利 滿
陳生
"""
    # split() with no argument breaks on any whitespace run, so tokens at
    # line ends are separated even without a trailing space on the line.
    list4 = ["'" + token + "'" for token in grid_text.split()
             if not token.isdigit()]
    print(len(list4))
    print("[" + ",".join(list4) + "]")
def is_chinese_or_space(str):
    """Return True when every character of *str* is Chinese or a space.

    Each character is checked with ``_is_chinese_or_space``.  A byte string
    is first decoded as UTF-8 (undecodable bytes are dropped) so that the
    check runs on characters rather than on individual bytes.  An empty
    string yields True.

    >>> is_chinese_or_space(u"中國 人")
    True

    >>> is_chinese_or_space(u"中國 人1")
    False

    >>> is_chinese_or_space(u"華為huawei")
    False
    >>> is_chinese_or_space(u"游泳褲xxxl")
    False
    """
    # The parameter keeps its original (builtin-shadowing) name so that
    # keyword callers are unaffected.
    if isinstance(str, bytes):
        # Was ``encode``: that left byte input as bytes, so multi-byte
        # characters were examined one byte at a time.
        str = str.decode("utf-8", "ignore")
    return all(_is_chinese_or_space(char) for char in str)
def is_english_or_space(str):
    """Return True when every character of *str* is a letter or a space.

    Each character is checked with ``_is_english_or_space``.  A byte string
    is first decoded as UTF-8 (undecodable bytes are dropped) so that the
    check runs on characters rather than on individual bytes.  An empty
    string yields True.

    >>> is_english_or_space(u"abc def1")
    False

    >>> is_english_or_space(u"abc def")
    True
    >>> is_english_or_space(u"游泳褲xxxl")
    False

    >>> is_english_or_space(u"茶具")
    False
    """
    # The parameter keeps its original (builtin-shadowing) name so that
    # keyword callers are unaffected.
    if isinstance(str, bytes):
        # Was ``encode``: that left byte input as bytes, so multi-byte
        # characters were examined one byte at a time.
        str = str.decode("utf-8", "ignore")
    return all(_is_english_or_space(char) for char in str)
def _is_chinese_or_space(uchar):
    """Return True when the single character *uchar* is Chinese or a space.

    "Chinese" is whatever the module's ``is_chinese`` helper accepts.

    >>> _is_chinese_or_space(u"人")
    True

    >>> _is_chinese_or_space(u"1")
    False

    >>> _is_chinese_or_space(u" ")
    True
    """
    # bool() keeps the strict True/False result regardless of what
    # is_chinese itself returns.
    return bool(is_chinese(uchar) or uchar == u" ")
def _is_english_or_space(uchar):
    """Tell whether the single character *uchar* is a non-Chinese letter or a space.

    A character accepted by ``is_chinese`` is rejected first; otherwise the
    result is True for anything ``isalpha()`` accepts and for the space
    character.

    >>> _is_english_or_space(u"1")
    False

    >>> _is_english_or_space(u"a")
    True

    >>> _is_english_or_space(u" ")
    True

    >>> _is_english_or_space(u"中")
    False
    """
    # The Chinese check runs first, then the letter/space test.
    return not is_chinese(uchar) and (uchar.isalpha() or uchar == u" ")

def find_one_side_chinese_and_another_side_is_english():
    """Copy the pairs where one side is Chinese and the other is English.

    Reads ``./short_name_global.txt`` (UTF-8), where each line holds two
    fields separated by ``|||``.  A line is written unchanged to
    ``./short_name_global_chinese_english.txt`` when one field passes
    ``is_chinese_or_space`` and the other passes ``is_english_or_space``,
    in either order.  Lines that do not split into exactly two fields
    (for example blank lines) are skipped.  Both files are closed even if
    an error occurs part-way through.
    """
    # The output is opened first, as before, so it is created/truncated
    # even when the input turns out to be missing.
    with codecs.open("./short_name_global_chinese_english.txt", "w", "utf-8") as out:
        with codecs.open("./short_name_global.txt", "r", "utf-8") as src:
            for line in src:
                fields = line.strip().split("|||")
                if len(fields) != 2:
                    continue
                a = fields[0].strip()
                b = fields[1].strip()
                chinese_then_english = is_chinese_or_space(a) and is_english_or_space(b)
                english_then_chinese = is_chinese_or_space(b) and is_english_or_space(a)
                if chinese_then_english or english_then_chinese:
                    out.write(line)

if __name__ == "__main__":
    # Run the doctests embedded in this module before doing any work.
    doctest.testmod()
    # Calls below are kept disabled; only the last uncommented call runs.
#    read_relevent_words()
#    parser_one_line_one_words2()
#    compare_pareser_one_line_one_words_result_lost_line_for_tmp()
#    build_invert_index()
#    build_word_segments_hash_map()
#    final_find_synomns_out()
#    interactive_mode()
#    print _filter("龜 鹿 補(bǔ) 腎丸 4.5 g*12 袋 水 蜜丸 / 盒 [ 補(bǔ) 腎 失眠 體弱 疲乏 壯 陽 ]")
#    print _filter("龜 牌 ( turtle ) 硬殼 防水 全效 蠟 g-223r")
#    post_process_wname_segments_illegal_characters()
#    filter_synonym_result()
#    test_redis_is_ready()
#    pivot_query_relvent_word_order_and_intersation_size()
#    find_short_name()
#    find_short_name2()
#    test_sorted()
#    find_only_one_word_difference()
#    extrace_names()
#    find_human_names()
    find_one_side_chinese_and_another_side_is_english()
#    print is_english_or_space(u"茶具")
更多文章、技術交流、商務合作、聯系博主
微信掃碼或搜索:z360901061

微信掃一掃加我為好友
QQ號聯(lián)系: 360901061
您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描下面二維碼支持博主2元、5元、10元、20元等您想捐的金額吧,狠狠點擊下面給點支持吧,站長非常感激您!手機微信長按不能支付解決辦法:請將微信支付二維碼保存到相冊,切換到微信,然后點擊微信右上角掃一掃功能,選擇支付二維碼完成支付。
【本文對您有幫助就好】元
