levenshtein_distance(字符串相似度算法)
# -*- coding: utf8 -*- #字符串相似度算法 #!/usr/bin/env python __author__ = "Administrator" def levenshtein(a,b): "Calculates the Levenshtein distance between a and b." n, m = len(a), len(b) if n > m: # Make sure n <= m, to use O(min(n,m)) space a,b = b,a n,m = m,n current = range(n+1) for i in range(1,m+1): previous, current = current, [i]+[0]*n for j in range(1,n+1): add, delete = previous[j]+1, current[j-1]+1 change = previous[j-1] if a[j-1] != b[i-1]: change = change + 1 current[j] = min(add, delete, change) return current[n] def levenshtein_distance(first, second): """Find the Levenshtein distance between two strings.""" if len(first) > len(second): first, second = second, first if len(second) == 0: return len(first) first_length = len(first) + 1 second_length = len(second) + 1 distance_matrix = [range(second_length) for x in range(first_length)] for i in range(1, first_length): for j in range(1, second_length): deletion = distance_matrix[i-1][j] + 1 insertion = distance_matrix[i][j-1] + 1 substitution = distance_matrix[i-1][j-1] if first[i-1] != second[j-1]: substitution += 1 distance_matrix[i][j] = min(insertion, deletion, substitution) return distance_matrix[first_length-1][second_length-1]
levenshtein_distance(字符串相似度算法):
所谓levenshtein distance即将一个字符串转换成另一个字符串所需要的最少修改次数,一般包括对字符的3种操作:1、删除。2、修改。3、增加。
如‘天通苑’和‘天通西苑’的度为1,执行一次插入操作。
声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。
- 上一篇: Base64加密和解密解决方案(个人项目经验)
- 下一篇: postgresql行转列并拼接字符串