python迭代器itertools(u010223750)
引言itertools是python中的迭代器,有非常强大的功能,掌握这个能够减少很多的编码量,需要写个博文mark一下
Let's begin
def chain(*iterables):
    # chain('ABC', 'DEF') --> A B C D E F
    for it in iterables:
        for element in it:
            yield element
如上面的函数定义,chain会把传入的每个可迭代对象依次展开一层;使用chain(*a)时,a的第一维被解包成多个参数,所以真正被展开的是a的第二维元素,举例来说明:
>>> a=[['abc','def']]>>> for item in itertools.chain(*a):... print item...abcdef>>> b=[[['abc','abc']]]>>> for item in itertools.chain(*b):... print item...['abc', 'abc']上面可以明显看出chain的区别。在itertools中和chain有一样功能的是from_iterable(iterable)函数
def combinations(iterable, r):
    # combinations('ABCD', 2) --> AB AC AD BC BD CD
    # combinations(range(4), 3) --> 012 013 023 123
    pool = tuple(iterable)
    n = len(pool)
    if r > n:
        return
    indices = range(r)
    yield tuple(pool[i] for i in indices)
    while True:
        for i in reversed(range(r)):
            if indices[i] != i + n - r:
                break
        else:
            return
        indices[i] += 1
        for j in range(i+1, r):
            indices[j] = indices[j-1] + 1
        yield tuple(pool[i] for i in indices)
由函数定义可以看出,combinations的作用是在iterable数组中产生长度为r的子序列,每个结果中的元素保持其在原数组中的先后位置,不考虑顺序的交换,比如AB和BA算作同一个组合,只会输出AB,类似于排列组合的C(m,n).
这个使用起来比较简单,吸引我的是其实现方式,我觉得很不错,是一个很好的实现方式,大体思路是将每个位置的数从其开始位置移动到其最大能够移动的位置,注意这种移动是从后往前移动
- combinations_with_replacement
def combinations_with_replacement(iterable, r):
    # combinations_with_replacement('ABC', 2) --> AA AB AC BB BC CC
    pool = tuple(iterable)
    n = len(pool)
    if not n and r:
        return
    indices = [0] * r
    yield tuple(pool[i] for i in indices)
    while True:
        for i in reversed(range(r)):
            if indices[i] != n - 1:
                break
        else:
            return
        indices[i:] = [indices[i] + 1] * (r - i)
        yield tuple(pool[i] for i in indices)
这个和前一个的区别是可以重复地选取自身,因此其实现方式和上面的区别是,每个元素都能移动到原数组的最后一个元素,而且当扫描归位之后,是将i位置及其之后的都置为indices[i]+1的数值
>>> for item in itertools.combinations_with_replacement('ABCD',3):... print item...('A', 'A', 'A')('A', 'A', 'B')('A', 'A', 'C')('A', 'A', 'D')('A', 'B', 'B')('A', 'B', 'C')('A', 'B', 'D')('A', 'C', 'C')('A', 'C', 'D')('A', 'D', 'D')('B', 'B', 'B')('B', 'B', 'C')('B', 'B', 'D')('B', 'C', 'C')('B', 'C', 'D')('B', 'D', 'D')('C', 'C', 'C')('C', 'C', 'D')('C', 'D', 'D')('D', 'D', 'D')- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
def compress(data, selectors): # compress('ABCDEF', [1,0,1,0,1,1]) --> A C E F return (d for d, s in izip(data, selectors) if s)compress的作用是选取符合selectors条件的data数据
def count(start=0, step=1): # count(10) --> 10 11 12 13 14 ... # count(2.5, 0.5) -> 2.5 3.0 3.5 ... n = start while True: yield n n += stepcount的作用是每次返回一个从start开始步长为step的数组,应用如下:
>>> a=itertools.count(0,1)>>> a.next()0>>> a.next()1>>> a.next()2>>>def cycle(iterable): # cycle('ABCD') --> A B C D A B C D A B C D ... saved = [] for element in iterable: yield element saved.append(element) while saved: for element in saved: yield element用法:循环生成iterable的元素:
>>> a=itertools.cycle('ABCD')>>> a.next()'A'>>> a.next()'B'>>> a.next()'C'>>> a.next()'D'>>> a.next()'A'>>> a.next()'B'>>> a.next()'C'>>> a.next()'D'- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
def dropwhile(predicate, iterable): # dropwhile(lambda x: x<5, [1,4,6,4,1]) --> 6 4 1 iterable = iter(iterable) for x in iterable: if not predicate(x): yield x break for x in iterable: yield x从函数定义可以看出来,返回iterable第一个不满足predicate的数组值以及之后的数组值:
>>> a=itertools.dropwhile(lambda x:x>1,[3,2,1,1])>>> a.next()1>>> a.next()1def ifilter(predicate, iterable): # ifilter(lambda x: x%2, range(10)) --> 1 3 5 7 9 if predicate is None: predicate = bool for x in iterable: if predicate(x): yield x返回符合条件的元素
- ifilterfalse
和ifilter相反,返回不符合条件的的元素
- imap
函数定义:
def imap(function, *iterables): # imap(pow, (2,3,10), (5,2,3)) --> 32 9 1000 iterables = map(iter, iterables) while True: args = [next(it) for it in iterables] if function is None: yield tuple(args) else: yield function(*args)功能:给iterables的元素添加上function功能
def izip(*iterables): # izip('ABCD', 'xy') --> Ax By iterators = map(iter, iterables) while iterators: yield tuple(map(next, iterators))功能:成对匹配
演示:
>>> a='ABC'>>> b='123'>>> c='xy'>>> d=itertools.izip(a,b,c)>>> d.next()('A', '1', 'x')>>> d.next()('B', '2', 'y')class ZipExhausted(Exception): passdef izip_longest(*args, **kwds): # izip_longest('ABCD', 'xy', fillvalue='-') --> Ax By C- D- fillvalue = kwds.get('fillvalue') counter = [len(args) - 1] def sentinel(): if not counter[0]: raise ZipExhausted counter[0] -= 1 yield fillvalue fillers = repeat(fillvalue) iterators = [chain(it, sentinel(), fillers) for it in args] try: while iterators: yield tuple(map(next, iterators)) except ZipExhausted: pass- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
和izip差不多,不过返回的是最长的匹配,不满足长度的使用指定的fillvalue代替
>>> c=itertools.izip_longest('ABCD','xy','1',fillvalue='*')
>>> c.next()
('A', 'x', '1')
>>> c.next()
('B', 'y', '*')
>>> c.next()
('C', '*', '*')
>>> c.next()
('D', '*', '*')
def permutations(iterable, r=None):
    # permutations('ABCD', 2) --> AB AC AD BA BC BD CA CB CD DA DB DC
    # permutations(range(3)) --> 012 021 102 120 201 210
    pool = tuple(iterable)
    n = len(pool)
    r = n if r is None else r
    if r > n:
        return
    indices = range(n)
    cycles = range(n, n-r, -1)
    yield tuple(pool[i] for i in indices[:r])
    while n:
        for i in reversed(range(r)):
            cycles[i] -= 1
            if cycles[i] == 0:
                indices[i:] = indices[i+1:] + indices[i:i+1]
                cycles[i] = n - i
            else:
                j = cycles[i]
                indices[i], indices[-j] = indices[-j], indices[i]
                yield tuple(pool[i] for i in indices[:r])
                break
        else:
            return
这个函数和combinations有着类似的功能,不同的是,permutations强调元素的顺序,AB和BA是不一样的,这个和排列组合里面的A(m,n)差不多
功能展示:
>>> it=itertools.permutations('ABCD',3)>>> for item in it:... print item...('A', 'B', 'C')('A', 'B', 'D')('A', 'C', 'B')('A', 'C', 'D')('A', 'D', 'B')('A', 'D', 'C')('B', 'A', 'C')('B', 'A', 'D')('B', 'C', 'A')('B', 'C', 'D')('B', 'D', 'A')('B', 'D', 'C')('C', 'A', 'B')('C', 'A', 'D')('C', 'B', 'A')('C', 'B', 'D')('C', 'D', 'A')('C', 'D', 'B')('D', 'A', 'B')('D', 'A', 'C')('D', 'B', 'A')('D', 'B', 'C')('D', 'C', 'A')('D', 'C', 'B')- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
官方定义的函数我没有怎么弄明白,于是写了一个递归的C++版本permutation:
#include<iostream>
using namespace std;

char str[4]={'A','B','C','D'};
char result[4];
bool flag[4]={false,false,false,false};
int fix=3;

void print(){
    for(int i=1;i<=fix;i++){
        cout<<result[i];
    }
    cout<<endl;
}

void f(int position){
    for(int i=0;i<4;i++){
        int t=-1;
        if(flag[i]==false){
            flag[i]=true;
            t=i;
            result[position]=str[i];
            if(position==fix){
                print();
                flag[i]=false;
                continue;
            }else{
                f(position+1);
            }
        }
        if(t!=-1){
            flag[t]=false;
        }
    }
}

int main() {
    f(1);
    return 0;
}
测试结果如下:
ABCABDACBACDADBADCBACBADBCABCDBDABDCCABCADCBACBDCDACDBDABDACDBADBCDCADCB- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
两者得到的结果是一样的。
函数定义:
def product(*args, **kwds): # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111 pools = map(tuple, args) * kwds.get('repeat', 1) result = [[]] for pool in pools: result = [x+[y] for x in result for y in pool] for prod in result: yield tuple(prod)product其实就是元素数组之间的相乘,展示如下:
>>> a=itertools.product('ABCD','xyz')>>> for item in a:... print item...('A', 'x')('A', 'y')('A', 'z')('B', 'x')('B', 'y')('B', 'z')('C', 'x')('C', 'y')('C', 'z')('D', 'x')('D', 'y')('D', 'z')def repeat(object, times=None): # repeat(10, 3) --> 10 10 10 if times is None: while True: yield object else: for i in xrange(times): yield object功能,返回times个object,看下面的运用:
>>> list(itertools.imap(pow,xrange(10),itertools.repeat(2)))[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]>>> list(itertools.imap(pow,xrange(10),[2,2,2,2,2,2,2,2,2,2]))[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]def starmap(function, iterable): # starmap(pow, [(2,5), (3,2), (10,3)]) --> 32 9 1000 for args in iterable: yield function(*args)这个和imap的区别是,imap选取每个参数的相同位置的元素元组作为function的参数,而starmap是将每个参数当做function的参数
- takewhile
这个和dropwhile相反,dropwhile是跳过开头满足predicate的元素,返回第一个不满足predicate的元素以及其之后的所有元素,而takewhile是依次返回满足predicate的元素,一旦遇到不满足的元素就停止迭代
函数定义:
def takewhile(predicate, iterable): # takewhile(lambda x: x<5, [1,4,6,4,1]) --> 1 4 for x in iterable: if predicate(x): yield x else: break演示:
>>> for item in itertools.takewhile(lambda x: x<5, [1,4,6,4,1]):... print item...14具体的运用例子场景:对于一段文本,我们需要统计每个单词的词频。
方法:运用nltk的分句和分词功能后,运用FreqDist进行统计词频
代码:
import nltkimport itertoolsdef process(str): sentences=itertools.chain(*[nltk.sent_tokenize(str.decode('utf-8').lower())]) words_dict=itertools.chain(*[nltk.word_tokenize(sen) for sen in sentences]) fdist=nltk.FreqDist(words_dict) print fdist.most_common(10) fdist.plot(50,cumulative=True)if __name__=='__main__': text="My interpretation: Gom Jabbar is an ancient non-Latin incantation in HPMORverse which Draco just happened to have researched in old dark tomes, and far predates the writings we're familiar with. Frank Herbert somehow picked up on the phrase, either overhearing it in passing or perhaps by having some actual knowledge of the magical world (e.g. magical relative or somesuch)." process(text)代码打印出这段文字的频率最高的10个单词,并作出前50个词的累积频率图,结果如下:
[(u'in', 3), (u'.', 3), (u'the', 3), (u',', 2), (u'magical', 2), (u'or', 2), (u'jabbar', 1), (u'and', 1), (u'writings', 1), (u'interpretation', 1)]
|
|