展开全部
#include<IOSTREAM.H>
#include<STDIO.H>
#include<STRING.H>
#include<STDLIB.H>
#include<CONIO.H>
#define WORDLEN 50
#include <IO.H>
#include<TIME.H>
#define PATH 2000
// One entry of the circular, doubly linked word list.
// The sentinel head node reuses `num` for the running total of all tokens.
struct node
{
    char word[WORDLEN];   // NUL-terminated token text
    float num;            // occurrence count (grand total in the head node)
    struct node *pre;     // previous entry
    struct node *next;    // next entry
};
// clock() samples: `start` is taken in main() before the scan, `end` in output().
clock_t start,end;
// Sentinel head of the word list, allocated while globals are initialised.
// NOTE(review): a non-constant file-scope initialiser is accepted by C++ but rejected by a C compiler.
struct node *headptr=(struct node*)malloc(sizeof(struct node));
// Entry point: counts word frequencies for a search pattern and writes a report.
//
// argv[1]: search pattern handed to searchAllFiles()
// argv[2]: path of the report file written by output()
//
// Returns 0 after a completed run and after printing the usage hint;
// returns 1 when the list head could not be allocated or an argument
// does not fit into a PATH-sized buffer.
int main(int argc,char *argv[])
{
    int output(char*);
    void searchAllFiles(char*);

    if(headptr==NULL)
    {
        printf("内存不足!\n");
        return 1;
    }
    // Empty circular list: the sentinel links to itself in both directions.
    strcpy(headptr->word," ");
    headptr->num=0;               // running total of all tokens
    headptr->pre=headptr;
    headptr->next=headptr;

    if(argc<3)
    {
        printf("请输入程序名,搜索路径及结果文件路径:\n");
        return 0;
    }
    // The callees copy paths into PATH-sized buffers, so reject anything longer
    // instead of overflowing them.
    if(strlen(argv[1])>=PATH||strlen(argv[2])>=PATH)
    {
        printf("路径过长!\n");
        return 1;
    }

    start=clock();
    searchAllFiles(argv[1]);
    output(argv[2]);
    return 0;
}
int searchword(char *fpath)
{
char filepath[PATH];
strcpy(filepath,fpath);
FILE *fptr;
fptr=fopen(filepath,"r");
char word[20]="";
char sword[2];
char ch;
int InsertAndSort(char*);
while((ch=fgetc(fptr))!=EOF)
{
if(!((ch>='a'&&ch<='z')||(ch>='A'&&ch<='Z')||(ch=='?')||(ch=='!')||(ch=='-'))&&(!strcmp(word,"")))//所得到的字符不是要处理的字符且word数组为空
continue;
if((ch>='a'&&ch<='z')||(ch>='A'&&ch<='Z')||(ch=='-'))//得到的字符是字母或连字符
{
sword[0]=tolower(ch);//将其变成小写
strncat(word,sword,1);
continue;
}
if(!((ch>='a'&&ch<='z')||(ch>='A'&&ch<='Z')||(ch=='-')||(ch=='?')||(ch=='!'))&&(strcmp(word,"")))//所得到的字符不是要处理的字符且word数组为非空
{
InsertAndSort(word);
strcpy(word,"");//重置word数组为空
continue;
}
if(((ch=='?')||(ch=='!'))&&(!strcmp(word,"")))//得到'!'或'?'且word数组为空
{
sword[0]=ch;
strncat(word,sword,1);
InsertAndSort(word);
strcpy(word,"");//重置word数组为空
continue;
}
if(((ch=='?')||(ch=='!'))&&(strcmp(word,"")))//得到'!'或'?'且word数组非空
{
InsertAndSort(word);//先处理word数组
strcpy(word,"");//重置word数组为空
sword[0]=ch;
strncat(word,sword,1);
InsertAndSort(word);
strcpy(word,"");//重置word数组为空
continue;
}
continue;
}
return 0;
}
// Records one occurrence of `word` in the global list behind headptr.
//
// The list is circular and doubly linked. A word that is already present has
// its count incremented; if it then outranks its predecessor it swaps payload
// (count and text) with the first entry of the run of lower-count entries in
// front of it, which keeps higher counts nearer the head. An unknown word is
// appended at the tail with count 1. headptr->num (the grand total) is
// incremented for every recorded occurrence.
//
// word: NUL-terminated token; only its first WORDLEN-1 characters are used.
// Returns 0 on success, -1 when no memory is available for a new entry.
int InsertAndSort(char* word)
{
    char key[WORDLEN];
    struct node *cur;

    // Clamp once so that lookup and storage agree on the same text.
    strncpy(key,word,sizeof(key)-1);
    key[sizeof(key)-1]='\0';

    for(cur=headptr->next;cur!=headptr;cur=cur->next)
    {
        if(strcmp(cur->word,key)!=0)
            continue;

        cur->num+=1;
        headptr->num+=1;
        if(cur->pre!=headptr&&cur->num>cur->pre->num)
        {
            // Walk towards the head past every entry with a smaller count ...
            struct node *dst=cur->pre->pre;
            while(dst!=headptr&&cur->num>dst->num)
                dst=dst->pre;
            dst=dst->next;           // ... then step back onto the first of them

            float cnt=cur->num;      // swap the counts
            cur->num=dst->num;
            dst->num=cnt;

            char text[WORDLEN];      // swap the words
            strcpy(text,cur->word);
            strcpy(cur->word,dst->word);
            strcpy(dst->word,text);
        }
        return 0;
    }

    // Not found: append a fresh entry just in front of the sentinel.
    struct node *tail=(struct node*)malloc(sizeof(struct node));
    if(tail==NULL)
        return -1;
    strcpy(tail->word,key);
    tail->num=1;
    tail->pre=headptr->pre;
    tail->next=headptr;
    headptr->pre->next=tail;
    headptr->pre=tail;
    headptr->num+=1;
    return 0;
}
// Writes the frequency report to `temppath`.
//
// For every list entry one line "<percent>% <word>" is written, where percent
// is the entry's share of headptr->num. The last line reports the CPU time
// between the global `start` sample and now (stored in the global `end`).
//
// Returns 0 in all cases; a failure to open or to finish writing the file is
// reported on stdout.
int output(char* temppath)
{
    struct node *tempnode;
    FILE *target=fopen(temppath,"w");

    if(!target)
    {
        printf("输入结果失败!\n");
        return 0;
    }

    for(tempnode=headptr->next;tempnode!=headptr;tempnode=tempnode->next)
        fprintf(target,"%4.2f%% %s\n",(tempnode->num/headptr->num)*100,tempnode->word);

    end=clock();
    fprintf(target,"the process time is:%.2f seconds\n",(double)(end-start)/(double)CLOCKS_PER_SEC);

    // fclose() flushes the buffered report; a failure here means it is incomplete.
    if(fclose(target)!=0)
        printf("输入结果失败!\n");
    return 0;
}
// Visits everything that matches a search pattern: regular files are passed
// to searchword(), sub-directories are scanned recursively with "\*" appended.
//
// filePath: pattern for _findfirst(). Matches are resolved relative to the
//           part of the pattern up to and including its last '\', '/' or ':'
//           (nothing, if the pattern has no such character).
// Entries whose full path would not fit into PATH bytes are skipped.
void searchAllFiles( char *filePath )
{
    struct _finddata_t fileInfo;
    char fullPath[PATH];
    size_t dirLen=0;
    size_t i;
    intptr_t hfind;                  // search handles are pointer-sized, not int

    for(i=0;filePath[i]!='\0';i++)
    {
        if(filePath[i]=='\\'||filePath[i]=='/'||filePath[i]==':')
            dirLen=i+1;
    }

    hfind=_findfirst(filePath,&fileInfo);
    if(hfind==-1)                    // nothing matches or the path is invalid
        return;

    do
    {
        size_t nameLen=strlen(fileInfo.name);

        // "." is the directory itself and ".." its parent: never descend into them.
        if(strcmp(fileInfo.name,".")==0||strcmp(fileInfo.name,"..")==0)
            continue;

        // Need room for prefix + name + "\*" + terminating NUL.
        if(dirLen+nameLen+3>sizeof(fullPath))
            continue;

        memcpy(fullPath,filePath,dirLen);
        memcpy(fullPath+dirLen,fileInfo.name,nameLen+1);

        if(fileInfo.attrib&_A_SUBDIR)
        {
            strcat(fullPath,"\\*");
            searchAllFiles(fullPath);
        }
        else
        {
            searchword(fullPath);
        }
    }while(_findnext(hfind,&fileInfo)==0);   // `continue` above also lands here

    _findclose(hfind);
}
#include<STDIO.H>
#include<STRING.H>
#include<STDLIB.H>
#include<CONIO.H>
#define WORDLEN 50
#include <IO.H>
#include<TIME.H>
#define PATH 2000
// One entry of the circular, doubly linked word list.
// The sentinel head node reuses `num` for the running total of all tokens.
struct node
{
    char word[WORDLEN];   // NUL-terminated token text
    float num;            // occurrence count (grand total in the head node)
    struct node *pre;     // previous entry
    struct node *next;    // next entry
};
// clock() samples: `start` is taken in main() before the scan, `end` in output().
clock_t start,end;
// Sentinel head of the word list, allocated while globals are initialised.
// NOTE(review): a non-constant file-scope initialiser is accepted by C++ but rejected by a C compiler.
struct node *headptr=(struct node*)malloc(sizeof(struct node));
// Entry point: counts word frequencies for a search pattern and writes a report.
//
// argv[1]: search pattern handed to searchAllFiles()
// argv[2]: path of the report file written by output()
//
// Returns 0 after a completed run and after printing the usage hint;
// returns 1 when the list head could not be allocated or an argument
// does not fit into a PATH-sized buffer.
int main(int argc,char *argv[])
{
    int output(char*);
    void searchAllFiles(char*);

    if(headptr==NULL)
    {
        printf("内存不足!\n");
        return 1;
    }
    // Empty circular list: the sentinel links to itself in both directions.
    strcpy(headptr->word," ");
    headptr->num=0;               // running total of all tokens
    headptr->pre=headptr;
    headptr->next=headptr;

    if(argc<3)
    {
        printf("请输入程序名,搜索路径及结果文件路径:\n");
        return 0;
    }
    // The callees copy paths into PATH-sized buffers, so reject anything longer
    // instead of overflowing them.
    if(strlen(argv[1])>=PATH||strlen(argv[2])>=PATH)
    {
        printf("路径过长!\n");
        return 1;
    }

    start=clock();
    searchAllFiles(argv[1]);
    output(argv[2]);
    return 0;
}
int searchword(char *fpath)
{
char filepath[PATH];
strcpy(filepath,fpath);
FILE *fptr;
fptr=fopen(filepath,"r");
char word[20]="";
char sword[2];
char ch;
int InsertAndSort(char*);
while((ch=fgetc(fptr))!=EOF)
{
if(!((ch>='a'&&ch<='z')||(ch>='A'&&ch<='Z')||(ch=='?')||(ch=='!')||(ch=='-'))&&(!strcmp(word,"")))//所得到的字符不是要处理的字符且word数组为空
continue;
if((ch>='a'&&ch<='z')||(ch>='A'&&ch<='Z')||(ch=='-'))//得到的字符是字母或连字符
{
sword[0]=tolower(ch);//将其变成小写
strncat(word,sword,1);
continue;
}
if(!((ch>='a'&&ch<='z')||(ch>='A'&&ch<='Z')||(ch=='-')||(ch=='?')||(ch=='!'))&&(strcmp(word,"")))//所得到的字符不是要处理的字符且word数组为非空
{
InsertAndSort(word);
strcpy(word,"");//重置word数组为空
continue;
}
if(((ch=='?')||(ch=='!'))&&(!strcmp(word,"")))//得到'!'或'?'且word数组为空
{
sword[0]=ch;
strncat(word,sword,1);
InsertAndSort(word);
strcpy(word,"");//重置word数组为空
continue;
}
if(((ch=='?')||(ch=='!'))&&(strcmp(word,"")))//得到'!'或'?'且word数组非空
{
InsertAndSort(word);//先处理word数组
strcpy(word,"");//重置word数组为空
sword[0]=ch;
strncat(word,sword,1);
InsertAndSort(word);
strcpy(word,"");//重置word数组为空
continue;
}
continue;
}
return 0;
}
// Records one occurrence of `word` in the global list behind headptr.
//
// The list is circular and doubly linked. A word that is already present has
// its count incremented; if it then outranks its predecessor it swaps payload
// (count and text) with the first entry of the run of lower-count entries in
// front of it, which keeps higher counts nearer the head. An unknown word is
// appended at the tail with count 1. headptr->num (the grand total) is
// incremented for every recorded occurrence.
//
// word: NUL-terminated token; only its first WORDLEN-1 characters are used.
// Returns 0 on success, -1 when no memory is available for a new entry.
int InsertAndSort(char* word)
{
    char key[WORDLEN];
    struct node *cur;

    // Clamp once so that lookup and storage agree on the same text.
    strncpy(key,word,sizeof(key)-1);
    key[sizeof(key)-1]='\0';

    for(cur=headptr->next;cur!=headptr;cur=cur->next)
    {
        if(strcmp(cur->word,key)!=0)
            continue;

        cur->num+=1;
        headptr->num+=1;
        if(cur->pre!=headptr&&cur->num>cur->pre->num)
        {
            // Walk towards the head past every entry with a smaller count ...
            struct node *dst=cur->pre->pre;
            while(dst!=headptr&&cur->num>dst->num)
                dst=dst->pre;
            dst=dst->next;           // ... then step back onto the first of them

            float cnt=cur->num;      // swap the counts
            cur->num=dst->num;
            dst->num=cnt;

            char text[WORDLEN];      // swap the words
            strcpy(text,cur->word);
            strcpy(cur->word,dst->word);
            strcpy(dst->word,text);
        }
        return 0;
    }

    // Not found: append a fresh entry just in front of the sentinel.
    struct node *tail=(struct node*)malloc(sizeof(struct node));
    if(tail==NULL)
        return -1;
    strcpy(tail->word,key);
    tail->num=1;
    tail->pre=headptr->pre;
    tail->next=headptr;
    headptr->pre->next=tail;
    headptr->pre=tail;
    headptr->num+=1;
    return 0;
}
// Writes the frequency report to `temppath`.
//
// For every list entry one line "<percent>% <word>" is written, where percent
// is the entry's share of headptr->num. The last line reports the CPU time
// between the global `start` sample and now (stored in the global `end`).
//
// Returns 0 in all cases; a failure to open or to finish writing the file is
// reported on stdout.
int output(char* temppath)
{
    struct node *tempnode;
    FILE *target=fopen(temppath,"w");

    if(!target)
    {
        printf("输入结果失败!\n");
        return 0;
    }

    for(tempnode=headptr->next;tempnode!=headptr;tempnode=tempnode->next)
        fprintf(target,"%4.2f%% %s\n",(tempnode->num/headptr->num)*100,tempnode->word);

    end=clock();
    fprintf(target,"the process time is:%.2f seconds\n",(double)(end-start)/(double)CLOCKS_PER_SEC);

    // fclose() flushes the buffered report; a failure here means it is incomplete.
    if(fclose(target)!=0)
        printf("输入结果失败!\n");
    return 0;
}
// Visits everything that matches a search pattern: regular files are passed
// to searchword(), sub-directories are scanned recursively with "\*" appended.
//
// filePath: pattern for _findfirst(). Matches are resolved relative to the
//           part of the pattern up to and including its last '\', '/' or ':'
//           (nothing, if the pattern has no such character).
// Entries whose full path would not fit into PATH bytes are skipped.
void searchAllFiles( char *filePath )
{
    struct _finddata_t fileInfo;
    char fullPath[PATH];
    size_t dirLen=0;
    size_t i;
    intptr_t hfind;                  // search handles are pointer-sized, not int

    for(i=0;filePath[i]!='\0';i++)
    {
        if(filePath[i]=='\\'||filePath[i]=='/'||filePath[i]==':')
            dirLen=i+1;
    }

    hfind=_findfirst(filePath,&fileInfo);
    if(hfind==-1)                    // nothing matches or the path is invalid
        return;

    do
    {
        size_t nameLen=strlen(fileInfo.name);

        // "." is the directory itself and ".." its parent: never descend into them.
        if(strcmp(fileInfo.name,".")==0||strcmp(fileInfo.name,"..")==0)
            continue;

        // Need room for prefix + name + "\*" + terminating NUL.
        if(dirLen+nameLen+3>sizeof(fullPath))
            continue;

        memcpy(fullPath,filePath,dirLen);
        memcpy(fullPath+dirLen,fileInfo.name,nameLen+1);

        if(fileInfo.attrib&_A_SUBDIR)
        {
            strcat(fullPath,"\\*");
            searchAllFiles(fullPath);
        }
        else
        {
            searchword(fullPath);
        }
    }while(_findnext(hfind,&fileInfo)==0);   // `continue` above also lands here

    _findclose(hfind);
}
2009-06-21
展开全部
语料库词频统计程序
老大让我写个统计程序,我就看看书写了两个.
这个是用c++的map方法,map内部实现是红黑树,应该效率比较高.
#include <map>
#include <string>
#include <iostream>
using namespace std;
// Word -> occurrence count.
typedef std::map<std::string, int> type_map;
typedef type_map::iterator type_iter;
// Global table shared by lookup(), insert(), display() and main().
type_map m;
// Scratch iterator: lookup() leaves it on the entry it found so that main() can bump the count.
type_iter it;
// Searches the global map `m` for `s`.
// Side effect: the global iterator `it` is left on the matching entry,
// or equal to m.end() when the word is absent.
// Returns true when the word is already in the map.
bool lookup(string s)
{
    it = m.find(s);
    return it != m.end();
}
// Adds `s` to the global map `m` with an initial count of 1.
// std::map::insert leaves an already existing entry untouched.
void insert(string s )
{
    type_map::value_type entry(s, 1);
    m.insert(entry);
}
// Tells whether byte `c` is part of a word.
//
// Separators (return false): CR, LF, space, '/', '[', ']', '{', EOF and the
// ASCII lower-case letters 'a'..'z' (presumably the tag letters of an
// annotated corpus; confirm against the input format). The range includes
// 'z'; the former test `c < 122` let it slip through as word content.
// A '{' additionally consumes input from `p` up to and including the next
// '}', or up to end of file when no '}' follows.
// Every other byte is word content (return true).
//
// p: stream to skip ahead in; only read when c == '{'.
bool gbr(char c,FILE *p)
{
    const bool separator =
        c=='\r' || c=='\n' || c==' ' || c=='/' ||
        c=='[' || c==']' || c=='{' || c==EOF ||
        (c>='a' && c<='z');

    if(!separator)
        return true;

    if(c=='{')
    {
        int skipped;                 // int, so that EOF ends the skip loop
        do{
            skipped=fgetc(p);
        }while(skipped!='}' && skipped!=EOF);
    }
    return false;
}
// Prints a header line followed by one "<count> <word>" line per entry of
// the global map `m`, in the map's iteration order.
// The global iterator `it` is used for the walk and ends up at m.end().
void display()
{
    cout<<"count vocabulary:"<<endl;
    for(it=m.begin();it!=m.end();++it)
        cout<<it->second<<" "<<it->first<<endl;
}
// Counts the words of c:/tt.txt and prints the table with display().
//
// The file is split into words by gbr(); each word is either inserted with
// count 1 or, when lookup() finds it, has its count incremented through the
// global iterator `it`.
// Returns 0, also when the file cannot be opened (a message is printed then).
int main(int argc, char* argv[])
{
    FILE *fp=fopen("c:/tt.txt","r");
    if(fp==NULL)
    {
        cout<<"file can not open";
        return 0;
    }

    int ch;                          // int, so EOF stays distinct from a 0xFF byte
    while((ch=fgetc(fp))!=EOF)
    {
        // std::string grows as needed, so a long word cannot overrun a buffer.
        string text;
        while(ch!=EOF && gbr((char)ch,fp)==true)
        {
            text+=(char)ch;
            ch=fgetc(fp);
        }

        if(!text.empty())            // recognized a word
        {
            if(lookup(text)==false)
                insert(text);
            else
                it->second=it->second+1;
        }
    }

    display();
    fclose(fp);
    return 0;
}
这个是我看到《The C Programming Language》后想起来的 hash 统计法
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#define HASHSIZE 101
// One word record; records that share a bucket form a singly linked chain.
struct nlist
{
    struct nlist *next;   // next record in the same chain, NULL at the end
    char name[30];        // NUL-terminated word
    int count;            // number of occurrences
};
// Bucket array indexed by hash(); static storage, so every chain starts out NULL.
static struct nlist *hashtab[HASHSIZE];
// Maps the NUL-terminated string `s` to a bucket index in [0, HASHSIZE).
// Each character is folded in as hashval = c + 31 * hashval.
unsigned hash(char*s)
{
    unsigned hashval=0;
    while(*s!='\0')
    {
        hashval=*s+31*hashval;
        s++;
    }
    return hashval%HASHSIZE;
}
// Finds the record for `s` in the chain selected by hash(s).
// Returns the matching record, or NULL when the word is not stored.
struct nlist* lookup(char *s)
{
    struct nlist *np=hashtab[hash(s)];
    while(np!=NULL&&strcmp(s,np->name)!=0)
        np=np->next;
    return np;
}
void install(char*name)
{
struct nlist *np,*nq;
unsigned hashval;
hashval=hash(name);
if(lookup(name)==NULL){//not found
np=(struct nlist*) malloc(sizeof(struct nlist));
strcpy(np->name,name);
np->count=1;
np->next=NULL;
nq=hashtab[hashval];
if(nq==NULL)
hashtab[hashval]=np;
else{
while(nq->next!=NULL)
nq=nq->next;
nq->next=np;
}//
}//end of if
else{
np=lookup(name);
np->count++;
}// found the key
}
void print_count()
{
struct nlist *np;
printf("count vocabulary:\n");
for(int i=0;i<HASHSIZE;i++)
{
np=hashtab[i];
while(np!=NULL)
{
printf("%d %s\n",np->count, np->name);
np=np->next;
}
}
}
// Tells whether byte `c` is part of a word.
//
// Separators (return false): CR, LF, space, '/', '[', ']', '{', EOF and the
// ASCII lower-case letters 'a'..'z' (presumably the tag letters of an
// annotated corpus; confirm against the input format). The range includes
// 'z'; the former test `c < 122` let it slip through as word content.
// A '{' additionally consumes input from `p` up to and including the next
// '}', or up to end of file when no '}' follows.
// Every other byte is word content (return true).
//
// p: stream to skip ahead in; only read when c == '{'.
bool gbr(char c,FILE *p)
{
    const bool separator =
        c=='\r' || c=='\n' || c==' ' || c=='/' ||
        c=='[' || c==']' || c=='{' || c==EOF ||
        (c>='a' && c<='z');

    if(!separator)
        return true;

    if(c=='{')
    {
        int skipped;                 // int, so that EOF ends the skip loop
        do{
            skipped=fgetc(p);
        }while(skipped!='}' && skipped!=EOF);
    }
    return false;
}
// Counts the words of c:/paper.txt with the hash table and prints the result.
//
// The file is split into words by gbr(); every non-empty word goes to
// install(). Words longer than the buffer are truncated.
// Returns 0 on success, 1 when the file cannot be opened.
int main()
{
    FILE *fp;
    char temp[30];
    int c;                           // int, so EOF stays distinct from a 0xFF byte

    fp=fopen("c:/paper.txt","r");
    if(fp==NULL)
    {
        printf("file can not open");
        return 1;                    // reading from a NULL stream would crash
    }

    while((c=fgetc(fp))!=EOF)
    {
        size_t len=0;
        while(c!=EOF&&gbr((char)c,fp)==true)
        {
            if(len<sizeof(temp)-1)   // keep what fits, drop the rest of the word
                temp[len++]=(char)c;
            c=fgetc(fp);
        }
        temp[len]='\0';
        if(temp[0]!='\0')            // recognized a word
            install(temp);
    }

    fclose(fp);
    print_count();
    return 0;
}
老大让我写个统计程序,我就看看书写了两个.
这个是用c++的map方法,map内部实现是红黑树,应该效率比较高.
#include <map>
#include <string>
#include <iostream>
using namespace std;
// Word -> occurrence count.
typedef std::map<std::string, int> type_map;
typedef type_map::iterator type_iter;
// Global table shared by lookup(), insert(), display() and main().
type_map m;
// Scratch iterator: lookup() leaves it on the entry it found so that main() can bump the count.
type_iter it;
// Searches the global map `m` for `s`.
// Side effect: the global iterator `it` is left on the matching entry,
// or equal to m.end() when the word is absent.
// Returns true when the word is already in the map.
bool lookup(string s)
{
    it = m.find(s);
    return it != m.end();
}
// Adds `s` to the global map `m` with an initial count of 1.
// std::map::insert leaves an already existing entry untouched.
void insert(string s )
{
    type_map::value_type entry(s, 1);
    m.insert(entry);
}
// Tells whether byte `c` is part of a word.
//
// Separators (return false): CR, LF, space, '/', '[', ']', '{', EOF and the
// ASCII lower-case letters 'a'..'z' (presumably the tag letters of an
// annotated corpus; confirm against the input format). The range includes
// 'z'; the former test `c < 122` let it slip through as word content.
// A '{' additionally consumes input from `p` up to and including the next
// '}', or up to end of file when no '}' follows.
// Every other byte is word content (return true).
//
// p: stream to skip ahead in; only read when c == '{'.
bool gbr(char c,FILE *p)
{
    const bool separator =
        c=='\r' || c=='\n' || c==' ' || c=='/' ||
        c=='[' || c==']' || c=='{' || c==EOF ||
        (c>='a' && c<='z');

    if(!separator)
        return true;

    if(c=='{')
    {
        int skipped;                 // int, so that EOF ends the skip loop
        do{
            skipped=fgetc(p);
        }while(skipped!='}' && skipped!=EOF);
    }
    return false;
}
// Prints a header line followed by one "<count> <word>" line per entry of
// the global map `m`, in the map's iteration order.
// The global iterator `it` is used for the walk and ends up at m.end().
void display()
{
    cout<<"count vocabulary:"<<endl;
    for(it=m.begin();it!=m.end();++it)
        cout<<it->second<<" "<<it->first<<endl;
}
// Counts the words of c:/tt.txt and prints the table with display().
//
// The file is split into words by gbr(); each word is either inserted with
// count 1 or, when lookup() finds it, has its count incremented through the
// global iterator `it`.
// Returns 0, also when the file cannot be opened (a message is printed then).
int main(int argc, char* argv[])
{
    FILE *fp=fopen("c:/tt.txt","r");
    if(fp==NULL)
    {
        cout<<"file can not open";
        return 0;
    }

    int ch;                          // int, so EOF stays distinct from a 0xFF byte
    while((ch=fgetc(fp))!=EOF)
    {
        // std::string grows as needed, so a long word cannot overrun a buffer.
        string text;
        while(ch!=EOF && gbr((char)ch,fp)==true)
        {
            text+=(char)ch;
            ch=fgetc(fp);
        }

        if(!text.empty())            // recognized a word
        {
            if(lookup(text)==false)
                insert(text);
            else
                it->second=it->second+1;
        }
    }

    display();
    fclose(fp);
    return 0;
}
这个是我看到《The C Programming Language》后想起来的 hash 统计法
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#define HASHSIZE 101
// One word record; records that share a bucket form a singly linked chain.
struct nlist
{
    struct nlist *next;   // next record in the same chain, NULL at the end
    char name[30];        // NUL-terminated word
    int count;            // number of occurrences
};
// Bucket array indexed by hash(); static storage, so every chain starts out NULL.
static struct nlist *hashtab[HASHSIZE];
// Maps the NUL-terminated string `s` to a bucket index in [0, HASHSIZE).
// Each character is folded in as hashval = c + 31 * hashval.
unsigned hash(char*s)
{
    unsigned hashval=0;
    while(*s!='\0')
    {
        hashval=*s+31*hashval;
        s++;
    }
    return hashval%HASHSIZE;
}
// Finds the record for `s` in the chain selected by hash(s).
// Returns the matching record, or NULL when the word is not stored.
struct nlist* lookup(char *s)
{
    struct nlist *np=hashtab[hash(s)];
    while(np!=NULL&&strcmp(s,np->name)!=0)
        np=np->next;
    return np;
}
void install(char*name)
{
struct nlist *np,*nq;
unsigned hashval;
hashval=hash(name);
if(lookup(name)==NULL){//not found
np=(struct nlist*) malloc(sizeof(struct nlist));
strcpy(np->name,name);
np->count=1;
np->next=NULL;
nq=hashtab[hashval];
if(nq==NULL)
hashtab[hashval]=np;
else{
while(nq->next!=NULL)
nq=nq->next;
nq->next=np;
}//
}//end of if
else{
np=lookup(name);
np->count++;
}// found the key
}
void print_count()
{
struct nlist *np;
printf("count vocabulary:\n");
for(int i=0;i<HASHSIZE;i++)
{
np=hashtab[i];
while(np!=NULL)
{
printf("%d %s\n",np->count, np->name);
np=np->next;
}
}
}
// Tells whether byte `c` is part of a word.
//
// Separators (return false): CR, LF, space, '/', '[', ']', '{', EOF and the
// ASCII lower-case letters 'a'..'z' (presumably the tag letters of an
// annotated corpus; confirm against the input format). The range includes
// 'z'; the former test `c < 122` let it slip through as word content.
// A '{' additionally consumes input from `p` up to and including the next
// '}', or up to end of file when no '}' follows.
// Every other byte is word content (return true).
//
// p: stream to skip ahead in; only read when c == '{'.
bool gbr(char c,FILE *p)
{
    const bool separator =
        c=='\r' || c=='\n' || c==' ' || c=='/' ||
        c=='[' || c==']' || c=='{' || c==EOF ||
        (c>='a' && c<='z');

    if(!separator)
        return true;

    if(c=='{')
    {
        int skipped;                 // int, so that EOF ends the skip loop
        do{
            skipped=fgetc(p);
        }while(skipped!='}' && skipped!=EOF);
    }
    return false;
}
// Counts the words of c:/paper.txt with the hash table and prints the result.
//
// The file is split into words by gbr(); every non-empty word goes to
// install(). Words longer than the buffer are truncated.
// Returns 0 on success, 1 when the file cannot be opened.
int main()
{
    FILE *fp;
    char temp[30];
    int c;                           // int, so EOF stays distinct from a 0xFF byte

    fp=fopen("c:/paper.txt","r");
    if(fp==NULL)
    {
        printf("file can not open");
        return 1;                    // reading from a NULL stream would crash
    }

    while((c=fgetc(fp))!=EOF)
    {
        size_t len=0;
        while(c!=EOF&&gbr((char)c,fp)==true)
        {
            if(len<sizeof(temp)-1)   // keep what fits, drop the rest of the word
                temp[len++]=(char)c;
            c=fgetc(fp);
        }
        temp[len]='\0';
        if(temp[0]!='\0')            // recognized a word
            install(temp);
    }

    fclose(fp);
    print_count();
    return 0;
}
参考资料: http://www.cnblogs.com/Tony-woo/archive/2007/11/13/958452.html
本回答被提问者采纳
已赞过
已踩过
评论
收起
你对这个回答的评价是?
展开全部
唉。。大学生呀
已赞过
已踩过
评论
收起
你对这个回答的评价是?
推荐律师服务:
若未解决您的问题,请您详细描述您的问题,通过百度律临进行免费专业咨询