清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
#!/usr/bin/perl use strict; #use warnings; use encoding 'utf8'; use Data::Dumper; sub dump_chars(); sub dump_sentenses(); sub dump_index(); sub recount_chars; sub mark_chars; sub dump_result; sub count_unsed_chars; sub count_weight; # 读取字频表 my %chars; open(CHAR_FILE, '<', 'lex-chars.lex'); while (my $line = <CHAR_FILE>) { chomp($line); my @items = split(/\//, $line); print "bad line: $line\n" if (scalar(@items) != 3); $chars{$items[0]} = $items[2]; } close(CHAR_FILE); # 排序,按字频输出 my @sorted_chars = sort { $chars{$b} <=> $chars{$a} } keys %chars; #dump_chars(); # 筛选出频率前1000个字 my %top_chars; for (my $i = 0; $i < 2000; $i++) { $top_chars{$sorted_chars[$i]} = 1; } # 读取名句语料库 my %sentenses; my %char_sentenses; my %first_chars; my %last_chars; while (my $file = <r01*>) { # print "reading $file\n"; open(FILE, '<', $file); while (my $line = <FILE>) { chomp($line); $line =~ s/^\s+//g; $line =~ s/\s+$//g; next if (!$line); $line =~ s/。$//; $line =~ s/!$//; $line =~ s/?$//; $line =~ s/,$//; next if ($line =~ /^—/); next if (length($line) < 10); next if (length($line) > 16); next if ($line =~ /^(/); next if ($line =~ /《/); next if ($line =~ /[0-9A-Za-z]/); next if ($line =~ /^,/); next if ($line =~ /-/); $sentenses{$line}++; my $first_char = substr($line, 0, 1); my $last_char = substr($line, length($line) - 1, 1); $first_chars{$first_char} = 1; $last_chars{$last_char} = 1; } close(FILE); } # 排序,按句频输出 my @sorted_sentenses = sort { $sentenses{$b} <=> $sentenses{$a} } keys %sentenses; #dump_sentenses(); # 过滤只出现过一次的句子 #while ($sentenses{$sorted_sentenses[$#sorted_sentenses]} < 2) { # pop(@sorted_sentenses); #} print 'total sentenses: ' . scalar(@sorted_sentenses) . "\n"; # 过滤末字无法接的句子 foreach my $sentense (keys(%sentenses)) { my $last_char = substr($sentense, length($sentense) - 1, 1); if (!$first_chars{$last_char}) { #print "delete $sentense\n"; delete($sentenses{$sentense}); } } print 'total sentenses: ' . scalar(%sentenses) . "\n"; # 过滤首字无法接的句子 foreach my $sentense (keys(%sentenses)) { my $first_char = substr($sentense, 0, 1); if (!$last_chars{$first_char}) { #print "delete $sentense\n"; delete($sentenses{$sentense}); } } print 'total sentenses: ' . scalar(%sentenses) . "\n"; foreach my $sentense (keys(%sentenses)) { # 建立首字索引 my $first_char = substr($sentense, 0, 1); if (!$char_sentenses{$first_char}) { $char_sentenses{$first_char} = []; } push($char_sentenses{$first_char}, $sentense); } # 输出索引信息 #dump_index(); # 统计top字出现在各句子的次数 my %char_in_sentense; foreach my $char (keys(%top_chars)) { my $count = 0; foreach my $sentense (@sorted_sentenses) { $count++ if (index($sentense, $char) >= 0); } $char_in_sentense{$char} = $count; } my @sorted_char_in_sentenses = sort { $char_in_sentense{$a} <=> $char_in_sentense{$b} } keys %char_in_sentense; #my $i = 0; #foreach my $c (@sorted_char_in_sentenses) { # $i++; # print "$i:\t$c\t " . $char_in_sentense{$c} . "\n"; #} # 以任意一个句子开头,接龙,首字不重复,直至不能接或首尾相连 my %used_first_chars; my %used_sentenses; my %used_chars; my $char_count = 0; my @sentense_queue; my $max_char_count = 0; my $max_depth = 0; my $max_loop_depth = 0; my $lastlog = time(); $| = 1; # disable IO buffer sub traverse { my ($sentense, $depth, $progress) = @_; # 每分钟显示最新进度 if (time() - $lastlog > 60) { print "sentense: $sentense, depth: $depth, count: $char_count, total progress: $progress\n"; $lastlog = time(); } return if ($used_sentenses{$sentense}); $depth++; $used_sentenses{$sentense} = 1; # 标记句子已使用 mark_chars($sentense); push(@sentense_queue, $sentense); if ($depth != scalar(@sentense_queue)) { print "$sentense: $depth\n"; dump_result(); exit; } if ($depth > $max_depth) { $max_depth = $depth; print "new depth: $depth\n"; dump_result(); } # 找到覆盖1000个字的方案(实际上100句职能覆盖500多个字) if ($char_count >= 1000) { if ($char_count >= 1000) { print "found one: $depth\n"; dump_result(); exit; } } if ($depth >= 100) { if ($char_count > $max_char_count) { $max_char_count = $char_count; print "new record: $max_char_count\n"; dump_result(); } #print "too deep: $char_count\n"; pop(@sentense_queue); return; } my $last_char = substr($sentense, length($sentense) - 1, 1); #print $last_char; # 找到闭环 if ($last_char eq substr($sentense_queue[0], 0, 1)) { if ($depth > $max_loop_depth) { $max_loop_depth = $depth; print "new loop: $depth\n"; dump_result(); } pop(@sentense_queue); return; } if ($char_sentenses{$last_char}) { my %child_sentenses; foreach my $s (@{$char_sentenses{$last_char}}) { if (!$used_sentenses{$s}) { my $count = count_weight($s); if ($count > 0) { $child_sentenses{$s} = $count; } } } my @sorted_children = sort { $child_sentenses{$b} <=> $child_sentenses{$a} } keys %child_sentenses; my $i = 0; my $len = scalar(@sorted_children); foreach my $s (@sorted_children) { $i++; my $first_char = substr($s, 0, 1); if (!$used_first_chars{$first_char}) { $used_first_chars{$first_char} = 1; traverse($s, $depth, $progress * ($len - $i + 1)); delete($used_sentenses{$s}); delete($used_first_chars{$first_char}); recount_chars(); } } pop(@sentense_queue); } else { pop(@sentense_queue); #print "dead at $sentense: $depth; last char:$last_char\n"; } } # 挑选第一个句子 my %child_sentenses; foreach my $s (keys(%sentenses)) { my $count = count_weight($s); if ($count > 0) { $child_sentenses{$s} = $count; } } my @sorted_children = sort { $child_sentenses{$b} <=> $child_sentenses{$a} } keys %child_sentenses; my $i = 0; my $len = scalar(@sorted_children); foreach my $s (@sorted_children) { $i++; my $first_char = substr($s, 0, 1); $used_first_chars{$first_char} = 1; traverse($s, 0, $len - $i + 1); delete($used_sentenses{$s}); delete($used_first_chars{$first_char}); recount_chars(); print "depth: 0, count: $char_count, total progress: " . ($len - $i + 1) . ", progress: $i/$len, weight: " . $child_sentenses{$s} . "\n"; } sub dump_chars() { my $i = 0; foreach my $char (@sorted_chars) { $i++; print "$i\t $char\t $chars{$char}\n"; } } sub dump_sentenses() { my $i = 0; foreach my $sentense (@sorted_sentenses) { $i++; print "$i\t $sentense\t $sentenses{$sentense}\n"; } } sub dump_index() { foreach my $char (keys(%char_sentenses)) { print "$char: " . scalar(@{$char_sentenses{$char}}) . "\n"; } } sub mark_chars { my $sentense = shift; for (my $i = 0, my $len = length($sentense); $i < $len; $i++) { my $char = substr($sentense, $i, 1); if ($top_chars{$char} && !$used_chars{$char}) { $char_count++; } $used_chars{$char} = 1; } } sub count_weight { my $sentense = shift; my $weight = 0; for (my $i = 0, my $len = length($sentense); $i < $len; $i++) { my $char = substr($sentense, $i, 1); if ($top_chars{$char} && !$used_chars{$char}) { if ($char_in_sentense{$char}) { $weight += 1 / $char_in_sentense{$char}; } } } return $weight; } sub count_unsed_chars { my $sentense = shift; my $count = 0; for (my $i = 0, my $len = length($sentense); $i < $len; $i++) { my $char = substr($sentense, $i, 1); if ($top_chars{$char} && !$used_chars{$char}) { $count++; } } return $count; } sub recount_chars { # 清空计算器 undef %used_chars; $char_count = 0; foreach my $sentense (keys(%used_sentenses)) { mark_chars($sentense); } } sub dump_result { print "chars: " . keys(%used_chars) . "\n"; print "char_count: $char_count\n"; my $i = 0; foreach my $sentense (@sentense_queue) { $i++; print "$i: $sentense\n"; } }