查找两个字符串之间的常用词-解网

问：

如何比较和查找两个标记化单词字符串之间的常用单词？

请帮我找到新的解决方案。

#include <stdio.h>
#include <string.h>
#include <ctype.h>

#define MAX 25

void findSame(char *str1, char *str2) {
    char *word1 = strtok(str1, " ");
    
    while (word1 != NULL) {
        char *word2 = strtok(str2, " ");
        
        while (word2 != NULL) {
            if (strcmp(word1, word2) == 0) {
                printf("%s ", word1);
                break;
            }
            word2 = strtok(NULL, " ");
        }
        word1 = strtok(NULL, " ");
    }
}

int main() {

    char str1[MAX], str2[MAX];
    int i;

    printf("Enter String 1: ");
    gets(str1);
    
    printf("Enter String 2: ");
    gets(str2);
    
    strlwr(str1); //convert strings to lowercase
    strlwr(str2); //convert strings to lowercase
    
    printf("\nOutput:\n");
    findSame(str1, str2);
    
    return 0;
}

程序 putput 应如下所示：

Enter String 1: The quick Brown FOX jumped over the LAzy dog
Enter String 2: Hey Quick brown cat and dog
quick brown dog

c 字符串函数 while-loop char

除非你想暴力破解它（两个循环），或者构建一个容器（哈希表、搜索树或某种集合），否则你可以用轻松做到这一点。只需将每个单词标记成两个数组，对每个数组进行排序，然后串联执行它们（很像合并排序中的合并操作）。您可以在线性时间内检测沿途的重复项，因此总体复杂性实际上由通常为 O（N.log（N））的排序控制。您可能会得到不同的输出顺序。如果顺序很重要，请再执行一次传递，使用原始顺序对排序的重复列表进行二元搜索。qsort

0赞 chux - Reinstate Monica 11/7/2023

@helloworld123如果一个单词在两个字符串中都出现了两次，那么输出应该包含该单词的 1 个还是 2 个？

答：

2赞 chux - Reinstate Monica 11/7/2023 #1

代码存在以下问题：

缓冲区溢出

"The quick Brown FOX jumped over the LAzy dog"至少需要 46 个字节才能存储为字符串，大小为 25。str1[MAX]

使用删除。自 2011 年以来，它不再是标准 C 库的一部分。gets()
使用并剥离尾部。fgets()'\n'
使用更宽的缓冲区。
测试用户是否尝试输入大量输入。

// Candidate helper function
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Candidate helper function
char *read_line(const char *prompt, size_t sz, char dest[sz]) {
  if (prompt) {
    fputs(prompt, stdout);
  }
  if (fgets(dest, (int) sz, stdin) ==  NULL) {
    perror("No input");
    exit (EXIT_FAILURE);
  }
  dest[strcspn(dest, "\n")] = '\0';  // Lop off potential trailing \n
  if (strlen(dest) + 1 >= sz) {
    perror("input too long");
    exit (EXIT_FAILURE);
  }
  return dest;
}
// To do: amend to flush input and re-prompt when input too long.


...
    // Sample usage

    //char str1[MAX], str2[MAX];
    //printf("Enter String 1: ");
    //gets(str1);
    //printf("Enter String 2: ");
    //gets(str2);

    #define LINE_MAX 100
    char str1[LINE_MAX + 1 + 1]; // 1 more for too long an input test, 1 more for \0
    read_line("Enter String 1: ", sizeof str1, str1);
    char str2[LINE_MAX + 1 + 1];
    read_line("Enter String 2: ", sizeof str2, str2);

修复“word”比较循环

把它留给其他人（或以后）。

永远不要使用 --it 是不安全的，因为它会在不知不觉中溢出缓冲区。始终改用。请参阅：为什么 gets 函数如此危险，以至于不应该使用它？getsfgets
MAX太小，无法包含示例输入字符串。将溢出缓冲区，导致 UB（未定义行为）。gets
在中，在第一次循环迭代 for 之后，内部循环 for 已经拆分了标记。因此，在第二次迭代中，将只看到 的第一个单词/标记。findSamestr1str2word2str2
我们必须拆分 [tokenize] 字符串并将其保存到数组中。然后，我们遍历这些字符串列表。char *

这是更正后的代码。它被注释：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <termios.h>

// NOTE/BUG: this is much too small for the input strings
#if 0
#define MAX 25
#else
#define MAX 250
#endif

// splitup -- split up a buffer into tokens
void
splitup(char **list,char *buf)
{
    size_t count = 0;

    // NOTE: this _is_ destructive of the original string

    char *cp = strtok(buf," ");

    while (cp != NULL) {
        if (count >= (MAX - 1))
            exit(2);

        list[count++] = cp;

        cp = strtok(NULL," ");
    }

    list[count] = NULL;
}

void
findSame(const char *str1, const char *str2)
{

    // preserve original string
    char tmp1[MAX];
    strcpy(tmp1,str1);
    // split into tokens
    char *list1[MAX];
    splitup(list1,tmp1);

    // preserve original string
    char tmp2[MAX];
    strcpy(tmp2,str2);
    // split into tokens
    char *list2[MAX];
    splitup(list2,tmp2);

    for (char **word1 = list1;  *word1 != NULL;  ++word1) {
        for (char **word2 = list2;  *word2 != NULL;  ++word2) {
            if (strcmp(*word1, *word2) == 0) {
                printf("%s ", *word1);
                break;
            }
        }
    }

    printf("\n");
}

void
prompt(char *buf,size_t size,const char *str)
{

    printf("%s: ",str);
    fflush(stdout);

    // safe replacement for gets
    char *cp = fgets(buf,size,stdin);
    if (cp == NULL)
        exit(1);

    // just a nicety for debugging ...
    // pretty print if stdin redirected from file
    struct termios tio;
    if (tcgetattr(fileno(stdin),&tio) < 0)
        fputs(buf,stdout);

    // strip the newline
    cp = strchr(buf,'\n');
    if (cp != NULL)
        *cp = 0;

    // lowercase the string
    for (;  *buf != 0;  ++buf)
        *buf = tolower((unsigned char) *buf);
}

int
main(void)
{

    char str1[MAX], str2[MAX];

    prompt(str1,sizeof(str1),"Enter string 1");
    prompt(str2,sizeof(str2),"Enter string 2");

    printf("\nOutput:\n");
    findSame(str1, str2);

    return 0;
}

以下是实际的程序输出：

Enter string 1: The quick Brown FOX jumped over the LAzy dog
Enter string 2: Hey Quick brown cat and dog

Output:
quick brown dog

0赞 Fe2O3 11/7/2023 #3

关于不用于输入，以及缓冲区太小，很容易溢出，有很好的反馈。下面的代码绕过了这些问题，但使用两个“编译时”字符串来获得所需的逻辑。接受用户输入（with ）留待另一天。gets()fgets()

下面，每个字符串被单独标记化，形成两个指向每个标记的指针数组。然后，一个普通循环（非最佳）调用，对两个集合中的令牌执行不区分大小写的比较。找到匹配项后，会将其报告给 stdout'。stricmp()

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

char **tokenise( char *str, const char *dlm, int *n ) {
    char *cp, *lw = str, **arr = NULL;
    for( int i = 0, stp = 2; stp--; i = 0 ) {
        for( cp = str; (cp = strtok(cp, dlm)) != NULL; cp = NULL, i++ )
            if( stp ) lw = cp; else arr[ i ] = cp;
        if( stp ) {
            while( lw > str ) if( !*--lw ) *lw = *dlm;
            arr = calloc( (*n = i) + 1, sizeof *arr );
        }
    }
    return arr;
}

int main( void ) {
    char *dlm = ",.! \t\n";

    char buf1[] = "dog,  CAT,  horse. APPLE!! Banana Grape cat  banana";
    puts( buf1 );
    int cnt1;
    char **arr1 = tokenise( buf1, dlm, &cnt1 );

    char buf2[] = "cat, banana, orange, grape, apple, box";
    puts( buf2 );
    int cnt2;
    char **arr2 = tokenise( buf2, dlm, &cnt2 );

    char *sep = "";
    for( int i = 0; i < cnt2; i++)
        for( int j = 0; j < cnt1; j++)
            if( stricmp( arr2[i], arr1[j] ) == 0 ) {
                printf( "%s%s", sep, arr1[j] );
                sep = ", ";
                break;
            }
    putchar( '\n' );

    free( arr2 );
    free( arr1 );

    return 0;
}

输出：

dog,  CAT,  horse. APPLE!! Banana Grape cat  banana
cat, banana, orange, grape, apple, box
CAT, Banana, Grape, APPLE

上一个：如何将特定的引号替换为撇号？

下一个：字符数组应该如何用作字符串？

查找两个字符串之间的常用词

Find common words between two strings

评论

评论