从文件中读取字符串，并在 c 中解析 /0 字符和 utf8 符号-解网

问：

我想写一个刽子手游戏，从文本文件中的列表中随机选择一个单词。

如果我在初始化时直接指定一个单词，则该实现可以正常工作：

char wort[] =  "Apfelbäumchen";

如果我从文件的某一行读取字符串，它就不再起作用了。我得到的字符串至少比单词长一个字符。我发现这是因为字符串结束字符 '\0'。另外，像德语的 ä、ü 和 ö 这样的字符似乎是以其他方式编码的，所以比较的结果是字符串不相等。

你能帮帮我吗？

#include <stdio.h>
#include <string.h>
#include <stdlib.h> // Wofür diese?
#include <time.h> //für den Zufall

#define MAX_WORD_LENGTH 50


// Picks one line of the text file `filename` at random and returns a copy of
// it made with strdup(), or NULL when the file cannot be opened.
//
// NOTE(review): fgets() keeps the line's trailing '\n' in the buffer, so the
// returned string still contains it. strlen() counts bytes, not characters,
// so a multi-byte UTF-8 letter adds more than one to the reported length.
char* WortAusDatei(const char* filename) {
    FILE* file = fopen(filename, "r");
    if (file == NULL){
        printf("Datei konnte nicht gefunden werden.\n");
        return NULL;
    }
    
    // NOTE(review): reseeds the generator on every call; calls within the
    // same second start the same random sequence.
    srand(time(0));
    int wordCount = 0;
    char word[MAX_WORD_LENGTH];
    char* selectedWord = NULL;
    
    // The k-th line read replaces the current pick when rand() % k == 0,
    // which is always the case for the first line (k == 1).
    while (fgets(word, MAX_WORD_LENGTH, file) != NULL) {
        if (rand() % ++wordCount == 0) {
            // NOTE(review): the previous strdup() result is overwritten
            // without being freed.
            selectedWord = strdup(word);
            //selectedWord = strcpy(word);
        }
    }

    fclose(file);

    // NOTE(review): selectedWord is still NULL here if fgets() never
    // returned a line; both calls below would then receive NULL.
    printf("Debug: Das Wort heißt %s", selectedWord);
    // NOTE(review): strlen() returns size_t, which does not match %d.
    printf("Debug: , es ist %d Buchstaben lang. \n" , strlen(selectedWord));

    //char* wort = malloc(100 * sizeof(char)); // reserve memory for the word
    //strcpy(wort, "Bratpfanne"); // copy the word into the reserved memory
    return selectedWord; // return the string
}

// Hangman game loop: fetches a word with WortAusDatei(), then repeatedly
// prints the guessed letters / placeholders, reads a guess with scanf() and
// updates the `eraten` flags until the word is solved or `leben` reaches 0.
int main(){
    const char* filename = "worte.txt";
    // NOTE(review): WortAusDatei() returns NULL when the file cannot be
    // opened; wort is used below without a NULL check.
    char* wort = WortAusDatei(filename);    
    //char wort[] = strcpy(WortAusDatei(filename));
    //char wort[] =  "Apfelbaum";
    printf ("%s", wort);
    //free(wort);
    //printf ("%s", wort);

char eingabe[100];
    int leben = 10;

    // NOTE(review): strlen() already excludes the terminating '\0'. For a
    // word read with fgets() the -1 happens to drop the trailing '\n'
    // instead. zeichenAnzahl is not read anywhere below.
    int zeichenAnzahl = strlen(wort)-1; // Minus one to NOT count the closing zero character

    // One flag per byte of wort; 1 means "guessed".
    // NOTE(review): the array has no initializer, yet its elements are read
    // in the display loop before anything is stored in them.
    int eraten[strlen(wort)];
    int i;
    
    //paintHangman;
            
    // Start of one round; the `goto anfang` at the bottom jumps back here.
    anfang:
    printf("\n");

    // Show a placeholder for every letter
    // NOTE(review): %c prints a single byte, so a multi-byte UTF-8 character
    // is split across several slots.
    for (i = 0; i < strlen(wort) - 1; i++){
        if (eraten[i] == 1){
            printf("%c ", wort[i]);
        } else {
            printf("_ ");
        }
    }
    
    printf("\n\n");

    // NOTE(review): %s has no field width, so input longer than 99 bytes
    // overflows eingabe. %s also stops at whitespace, so eingabe never
    // contains a newline.
    scanf("%s", eingabe); 

    // Guessing the whole word at once wins immediately.
    // NOTE(review): presumably never true for a word read from the file,
    // because wort still ends in '\n' while eingabe does not.
    if (strcmp(eingabe, wort) == 0){
        printf("Gewonnen!\n");
        return 0;
    }

    // If the input is a single letter (i.e. length 1),
    // check whether that letter occurs in the word.
    // NOTE(review): strlen() counts bytes, so a multi-byte letter such as
    // "ä" in UTF-8 does not have length 1 and ends up in the else branch.
    if (strlen(eingabe) == 1){
        int i;
        int trefferanzahl = 0;
        for (i = 0; wort[i] != '\0'; i++){
            if (wort[i] == eingabe[0]){
                trefferanzahl++;
                printf("Treffer\n");
                eraten[i] = 1;

                // Check whether all letters of the word have been guessed
                // NOTE(review): this loop runs over all strlen(wort) bytes,
                // one more than the display loop above.
                int alleEraten = 1;
                for (int i = 0; i < strlen(wort); i++){
                    if (eraten[i] != 1){
                        alleEraten = 0;
                        break;
                    }
                }
                if (alleEraten){
                    printf("Gewonnen, das Lösungswort ist: %s\n", wort);
                    return 0;
                }
                
            }
        }
        // No byte matched: the guess costs one life.
        if (trefferanzahl == 0){
            leben--;
            printf("Du hast nurnoch %d Leben\n", leben);
            if (leben == 0){
                printf("Verloren");
                return 0;
            }
        }

    } else {
        printf("Eingabe ungültig\n");
    }

    goto anfang;

    return 0;
}

我对字符串进行了研究，并尝试使用不同的方法来复制字符串，例如 strdup 和 strcpy。

我测试过：如果在初始化时直接给出单词，而不是从文件中读取它，程序就能正常工作。

C 文件字符编码字符串比较

我已经看到额外的字符往往由负字节后跟正字节组成。这是因为整数以字节表示的方式。无符号字节可以表示从 0（00）到 255（FF）的数字，但有符号整数使用最高位作为 + 或 - 的标志。所以作为有符号字节，FF 将是 -1（80 才是 -128）。十六进制值中所有额外的 F 都在那里，因为在 C 中，标准 int 使用 4（我认为）字节，而 char 变量只有一个字节。

2赞 Oka 7/3/2023 #2

首先，您可能希望在程序的早期调用 setlocale，否则程序将在 "C" 区域设置（locale）下运行。最简单的做法是：

setlocale(LC_ALL, "");

这会根据用户的首选项（环境）设置区域设置；或者使用类似下面的调用

setlocale(LC_ALL, "en_US.UTF-8"); /* or "de_DE.UTF-8", or whatnot */

显式设置。

在 UTF-8 中，ä（U+00E4）是一个多字节字符，由字节序列 0xC3 0xA4 组成。

演示这一点的程序：

#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Demonstrates how a UTF-8 string decomposes into multibyte characters:
 * for every character of the word it prints the character, its length in
 * bytes and the value of each byte.
 *
 * Returns EXIT_SUCCESS when the whole word was decoded, EXIT_FAILURE when
 * the locale could not be selected or the word contains a byte sequence
 * that is invalid in that locale.
 */
int main(void)
{
    /*
     * mblen() decodes according to the current locale, so a UTF-8 locale
     * must be active. setlocale() returns NULL when the request cannot be
     * honoured; carrying on in the "C" locale would make the loop below
     * stop silently at the first non-ASCII byte.
     */
    if (!setlocale(LC_ALL, "en_US.UTF-8")) { /* or an equally appropriate locale */
        fprintf(stderr, "Could not select the en_US.UTF-8 locale.\n");
        return EXIT_FAILURE;
    }

    const char word[] = "Apfelbäumchen";
    int clen = 0;
    size_t offset = 0;

    /* Start decoding from the initial conversion state. */
    mblen(NULL, 0);

    while ((clen = mblen(word + offset, MB_CUR_MAX)) > 0) {
        printf("%.*s | UTF-8 MBS<%d>: ", clen, word + offset, clen);

        /* Print the bytes as unsigned values so 0xC3 is not sign-extended. */
        for (int i = 0; i < clen; i++)
            printf("0x%hhX ", (unsigned char) word[offset + i]);

        printf("\n");
        offset += (size_t) clen;
    }

    /* 0 means the terminating null byte was reached; -1 is a decoding error. */
    if (clen < 0) {
        fprintf(stderr, "Invalid multibyte sequence at byte offset %zu.\n", offset);
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}

输出：

A | UTF-8 MBS<1>: 0x41 
p | UTF-8 MBS<1>: 0x70 
f | UTF-8 MBS<1>: 0x66 
e | UTF-8 MBS<1>: 0x65 
l | UTF-8 MBS<1>: 0x6C 
b | UTF-8 MBS<1>: 0x62 
ä | UTF-8 MBS<2>: 0xC3 0xA4 
u | UTF-8 MBS<1>: 0x75 
m | UTF-8 MBS<1>: 0x6D 
c | UTF-8 MBS<1>: 0x63 
h | UTF-8 MBS<1>: 0x68 
e | UTF-8 MBS<1>: 0x65 
n | UTF-8 MBS<1>: 0x6E

以 null 结尾的多字节字符串（NTMBS）通常可以与“窄”字节字符串函数（例如 puts、strcpy）良好配合，但是在尝试处理 NTMBS 中的单个字符时，事情会变得很麻烦。

这是因为 strlen 计算的是字节数，因此 strlen("ä") 的结果是 2。

该额外的字节导致您尝试在此处修复的不一致：

int zeichenAnzahl = strlen(wort)-1; // Minus one to NOT count the closing zero character

(请注意，strlen 在其计数中不包括以 null 结尾的字节。)

如上所示，mblen（也可以考虑 mbrlen）可用于确定多字节字符的字节长度。printf("%.*s", length, start_of_sequence) 可用于从字符串中最多打印 length 个字节。

您需要更仔细地统计这些字符，以便根据输入准确地确定标志数组（eraten）的大小和索引（在上面的示例中，这可能只需在循环中执行 count++ 即可）。

此外，比较两个单独的多字节字符通常需要了解它们的字节长度（例如，使用 strncmp / memcmp，但 AFAIK 可能会因复杂的移位状态、混合编码或重复/等效而分崩离析）。

或者，可以将多字节字符和字符串转换为宽字符和宽字符串，或者使用宽字符支持来编写整个程序。例如，wcslen(L"ä") 为 1，并且 L"äb"[0] == L"bä"[1] 通常为真。您可能会发现这是最简单的路线。

(请注意，必须小心，不要在同一个流上混用字节（“窄”）I/O 函数和宽字符 I/O 函数。)

另请参阅：ICU。

当 fgets 读取到换行符并且缓冲区中还有空间时，它会将换行符放入缓冲区。这是字符串比预期长一个字节的另一个原因。

相反，scanf("%s", eingabe) 永远不会在缓冲区中放置换行符，因为 %s 在读到空白字符时终止。

(请注意，不限制长度的 scanf("%s", ...) 非常危险。例如，如果您有 char buf[128];，请使用 scanf("%127s", ...) 限制输入，这为以 null 结尾的字节留出了空间。或者干脆使用 fgets。)

您需要处理这些换行符以规范化您的输入（参见“从 fgets() 输入中删除尾随换行符”）。

以下

if (rand() % ++wordCount == 0) {
    selectedWord = strdup(word);

会随机泄漏先前 strdup 调用所分配的内存。

这大致是水库采样的流版本（其中 k=1）。

为了避免不断分配和释放内存，这可以通过两个缓冲区 A 和 B 来完成，其中读取行逐个填充 B，并随机复制到 A（始终复制第一行读取）。或者，可以执行指针交换，而不是复制。

结果是 A 的内容，可以在函数末尾将其传递给 strdup。

通常最好在程序开始时只调用一次 srand。即使您当前只调用一次 WortAusDatei，这种情况也可能会改变，而在同一秒内多次调用 srand(time(0)) 将导致相同的随机序列。

避免使用 goto 创建循环——只需使用 while 或 for。如果您发现循环内部嵌套过深，请将其中一部分重构为函数。

这是一个粗略的例子，实现了我所讨论的大部分内容，并在某种程度上重构了你的程序。在编码（和移位状态）方面更有经验的人可能会在这方面戳一些漏洞，但对于一个小用例来说应该没问题（只是不要将其用作明确的参考）。

#include <locale.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MAX_WORD_LENGTH 64

/*
 * Reports whether each of the first `length` flags in `set` is true.
 * An empty range (length == 0) yields true.
 */
static bool every(bool *set, size_t length)
{
    size_t checked = 0;

    /* Advance past the leading run of true flags. */
    while (checked < length && set[checked])
        checked++;

    /* Reaching the end means no false flag was encountered. */
    return checked == length;
}

/*
 * Returns a heap-allocated copy of a randomly chosen built-in word, for use
 * when no word could be obtained from the word file.
 *
 * badf: name of the word file that could not be used; it only appears in
 *       the warning written to stderr and may be NULL.
 *
 * The caller owns the returned string and must free() it. The function never
 * returns NULL: if the copy cannot be allocated it reports the error and
 * terminates the program with EXIT_FAILURE.
 */
static char *get_default_word(const char *badf)
{
    static const char *const words[] = {
        "error",
        "sea",
        "spell",
        "elegant",
        "zippy",
        "chalk",
        "sweet",
        "join",
        "superb"
    };
    const size_t word_count = sizeof words / sizeof *words;

    /* Passing NULL for %s is undefined behaviour, so substitute a marker. */
    fprintf(stderr,
            "WARN: Generating default word. Check validity of word file `%s`.\n",
            badf ? badf : "(null)");

    const char *pick = words[rand() % word_count];
    const size_t size = strlen(pick) + 1;

    /*
     * malloc + memcpy instead of strdup: strdup is POSIX (ISO C only since
     * C23) and is not declared when compiling as strict ISO C11.
     */
    char *def = malloc(size);

    if (!def) {
        perror("Could not generate default word. Something is quite wrong");
        exit(EXIT_FAILURE);
    }

    memcpy(def, pick, size);

    return def;
}

/*
 * Picks one word at random from the text file `filename`, which is expected
 * to hold one word per line.
 *
 * Every usable line has the same chance of being chosen: the k-th usable
 * line replaces the current pick when rand() % k == 0. Line terminators
 * ("\n" or "\r\n") are stripped, blank lines are ignored, and lines that do
 * not fit into MAX_WORD_LENGTH - 1 bytes are skipped as a whole rather than
 * being cut into several bogus words.
 *
 * Returns a heap-allocated copy of the chosen word, which the caller must
 * free(), or NULL when the file cannot be opened (the reason is reported
 * with perror), holds no usable word, or the copy cannot be allocated.
 */
static char *word_from_file(const char *filename) {
    FILE *file = fopen(filename, "r");

    if (!file) {
        perror(filename);
        return NULL;
    }

    size_t candidates = 0;
    char word[MAX_WORD_LENGTH] = { 0 };
    char temp[MAX_WORD_LENGTH] = { 0 };

    while (fgets(temp, sizeof temp, file)) {
        size_t len = strcspn(temp, "\r\n");

        if (temp[len] == '\0') {
            /*
             * fgets stopped without storing a line terminator: either the
             * file ended or the buffer was full. Consume what is left of
             * the line so it is not mistaken for another word.
             */
            bool overlong = false;
            int c;

            while ((c = fgetc(file)) != EOF && c != '\n')
                if (c != '\r')
                    overlong = true;

            if (overlong)
                continue;
        }

        temp[len] = '\0';

        /* A blank line is not a word and must not take part in the draw. */
        if (len == 0)
            continue;

        if ((rand() % ++candidates) < 1)
            memcpy(word, temp, len + 1);
    }

    fclose(file);

    if (!*word)
        return NULL;

    /*
     * malloc + memcpy instead of strdup: strdup is POSIX (ISO C only since
     * C23) and is not declared when compiling as strict ISO C11.
     */
    size_t size = strlen(word) + 1;
    char *copy = malloc(size);

    if (copy)
        memcpy(copy, word, size);

    return copy;
}

/*
 * Counts the characters of the null-terminated string `mbs` by counting
 * every byte whose top two bits are not 10, i.e. every byte that is not a
 * UTF-8 continuation byte. Assumes `mbs` is UTF-8 encoded.
 */
static size_t character_count(const char *mbs)
{
    const unsigned char *byte = (const unsigned char *) mbs;
    size_t total = 0;

    while (*byte != 0) {
        /* The comparison yields 1 for a character's first byte, else 0. */
        total += (*byte & 0xC0u) != 0x80u;
        byte++;
    }

    return total;
}

/*
 * Prints one line showing the state of the word: each of the first `length`
 * multibyte characters of `string` is written followed by a space when its
 * entry in `flags` is set, and as "_ " otherwise. Printing stops early if
 * mblen() reaches the end of the string or cannot decode a character.
 */
static void display(const char *string, bool *flags, size_t length)
{
    const char *cursor = string;

    /* Put mblen() back into its initial conversion state. */
    mblen(NULL, 0);

    for (size_t slot = 0; slot < length; slot++) {
        const int width = mblen(cursor, MB_CUR_MAX);

        /* 0: terminating null byte reached; -1: invalid sequence. */
        if (width <= 0)
            break;

        if (!flags[slot])
            fputs("_ ", stdout);
        else
            printf("%.*s ", width, cursor);

        cursor += width;
    }

    putchar('\n');
}

/*
 * Sets the flag of every character in `mbs` that equals the first multibyte
 * character of `query`, and returns how many characters matched.
 *
 * Only the first `length` characters of `mbs` are examined, and `flags`
 * must have at least that many entries. Flags are only ever set, never
 * cleared. A `query` that is empty or does not start with a valid
 * character matches nothing.
 */
static size_t mark(const char *mbs, bool *flags, size_t length, const char *query)
{
    /* Measure the guessed character from the initial conversion state. */
    mblen(NULL, 0);
    const int query_width = mblen(query, MB_CUR_MAX);

    /* Reset again before walking through the word itself. */
    mblen(NULL, 0);

    size_t hits = 0;
    const char *cursor = mbs;

    for (size_t slot = 0; slot < length; slot++) {
        const int width = mblen(cursor, MB_CUR_MAX);

        if (width <= 0)
            break;

        /* Characters are equal when they have the same length and bytes. */
        if (width == query_width && memcmp(cursor, query, (size_t) width) == 0) {
            flags[slot] = true;
            hits++;
        }

        cursor += width;
    }

    return hits;
}

/*
 * Hangman game. The word is taken from the file named by the first
 * command-line argument ("words.txt" when none is given), falling back to a
 * built-in word if that yields nothing. The player then guesses one
 * character per input line until every character is revealed, the ten lives
 * are used up, or standard input ends.
 */
int main(int argc, char **argv)
{
    /* Adopt the user's locale so mblen() can decode the word and guesses. */
    setlocale(LC_ALL, "");
    srand((unsigned) time(NULL));

    const char *path = "words.txt";

    if (argc > 1)
        path = argv[1];

    char *secret = word_from_file(path);

    if (secret == NULL)
        secret = get_default_word(path);

    fprintf(stderr, "DEBUG: <<%s>>\n", secret);

    const size_t slots = character_count(secret);
    unsigned remaining = 10;

    /* One flag per character of the word; true once it has been guessed. */
    bool revealed[slots];

    memset(revealed, 0, sizeof revealed);

    for (;;) {
        char line[128];

        display(secret, revealed, slots);

        printf("Enter a guess: ");

        /* End of input (or a read error) ends the game. */
        if (fgets(line, sizeof line, stdin) == NULL)
            break;

        /* Drop the newline that fgets keeps at the end of the line. */
        line[strcspn(line, "\n")] = '\0';

        if (mark(secret, revealed, slots, line) > 0) {
            puts("Hit!");
        } else {
            puts("Miss!");

            remaining--;

            if (remaining == 0) {
                puts("Game over! You ran out of lives!");
                break;
            }
        }

        if (every(revealed, slots)) {
            puts("That's it! You won!");
            break;
        }

        printf("Not quite there! You have %u lives remaining.\n", remaining);
    }

    printf("The word was: %s\n", secret);
    free(secret);

    return 0;
}

使用中：

DEBUG: <<Apfelbäumchen>>
_ _ _ _ _ _ _ _ _ _ _ _ _ 
Enter a guess: A
Hit!
Not quite there! You have 10 lives remaining.
...
...
A p f e l b _ u m c h e n 
Enter a guess: ä
Hit!
That's it! You won!
The word was: Apfelbäumchen

从文件中读取字符串，并在 c 中解析 /0 字符和 utf8 符号

Read strings out of a file and parse /0 character and utf8 signs right in c

评论

评论

评论