Skip to content

Why does � appear in output instead of non-letter characters? #132

@rdmpage

Description

@rdmpage

Hi @dimus, I'm playing with gnfinder v1.0.0+ and am a bit puzzled. I'm regularly seeing � appear in the JSON output. I'm using gnfinder on a Mac with the flags --utf8-input --words-around 4, yet I'm getting �. I'm attaching an example text file, along with the JSON I get from this command:

gnfinder --utf8-input --words-around 4 --format pretty text_0.txt

It seems that anything which isn't a letter is replaced, sometimes whole words. For example, years, periods, etc. all get replaced. Is this the intention? It means that useful information about the context of the name is lost.

text_0.txt

{
  "metadata": {
    "documentation": "",
    "date": "2022-11-03T16:17:47.499171Z",
    "gnfinderVersion": "v1.0.0+",
    "inputFile": "text_0.txt",
    "textExtractSec": 0.00004,
    "nameFindingSec": 0.001241,
    "totalSec": 0.001281,
    "wordsAround": 4,
    "language": "eng",
    "withBayes": true,
    "totalWords": 582,
    "totalNameCandidates": 200,
    "totalNames": 39
  },
  "names": [
    {
      "cardinality": 1,
      "verbatim": "Microschismus",
      "name": "Microschismus",
      "oddsLog10": 5.871036092226226,
      "start": 221,
      "end": 234,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "moths",
        "of",
        "the",
        "genus"
      ],
      "wordsAfter": [
        "Fletcher",
        "�",
        "Lepidoptera",
        "Alucitidae"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "(Lepidoptera:",
      "name": "Lepidoptera",
      "oddsLog10": 5.112446580930601,
      "start": 250,
      "end": 263,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "genus",
        "Microschismus",
        "Fletcher",
        "�"
      ],
      "wordsAfter": [
        "Alucitidae",
        "from",
        "the",
        "Republic"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "Alucitidae)",
      "name": "Alucitidae",
      "oddsLog10": 4.764456240418255,
      "start": 264,
      "end": 275,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "Microschismus",
        "Fletcher",
        "�",
        "Lepidoptera"
      ],
      "wordsAfter": [
        "from",
        "the",
        "Republic",
        "of"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "Microschismus",
      "name": "Microschismus",
      "oddsLog10": 5.871036092226226,
      "start": 489,
      "end": 502,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "moths",
        "of",
        "the",
        "genus"
      ],
      "wordsAfter": [
        "Fletcher",
        "�",
        "Lepidoptera",
        "Alucitidae"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "(Lepidoptera:",
      "name": "Lepidoptera",
      "oddsLog10": 5.112446580930601,
      "start": 518,
      "end": 531,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "genus",
        "Microschismus",
        "Fletcher",
        "�"
      ],
      "wordsAfter": [
        "Alucitidae",
        "from",
        "the",
        "Republic"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "Alucitidae)",
      "name": "Alucitidae",
      "oddsLog10": 4.764456240418255,
      "start": 532,
      "end": 543,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "Microschismus",
        "Fletcher",
        "�",
        "Lepidoptera"
      ],
      "wordsAfter": [
        "from",
        "the",
        "Republic",
        "of"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "Microschismus",
      "name": "Microschismus",
      "oddsLog10": 5.871036092226226,
      "start": 663,
      "end": 676,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "äàííûå",
        "ïî",
        "âååðîêðûëêàì",
        "ðîäà"
      ],
      "wordsAfter": [
        "Fletcher",
        "�",
        "Lepidoptera",
        "Alucitidae"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "(Lepidoptera:",
      "name": "Lepidoptera",
      "oddsLog10": 5.112446580930601,
      "start": 692,
      "end": 705,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "ðîäà",
        "Microschismus",
        "Fletcher",
        "�"
      ],
      "wordsAfter": [
        "Alucitidae",
        "Þæíîé",
        "Àôðèêè",
        "Petr"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "Alucitidae)",
      "name": "Alucitidae",
      "oddsLog10": 4.764456240418255,
      "start": 706,
      "end": 717,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "Microschismus",
        "Fletcher",
        "�",
        "Lepidoptera"
      ],
      "wordsAfter": [
        "Þæíîé",
        "Àôðèêè",
        "Petr",
        "Ustjuzhanin"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "Alucitidae,",
      "name": "Alucitidae",
      "oddsLog10": 4.764456240418255,
      "start": 1686,
      "end": 1697,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "author",
        "Key",
        "Words"
      ],
      "wordsAfter": [
        "Microschismus",
        "many-plumed",
        "moths",
        "Republic"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "Microschismus,",
      "name": "Microschismus",
      "oddsLog10": 5.871036092226226,
      "start": 1698,
      "end": 1712,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "author",
        "Key",
        "Words",
        "Alucitidae"
      ],
      "wordsAfter": [
        "many-plumed",
        "moths",
        "Republic",
        "of"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "Alucitidae,",
      "name": "Alucitidae",
      "oddsLog10": 4.764456240418255,
      "start": 1797,
      "end": 1808,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "new",
        "data",
        "Ключевые",
        "Слова"
      ],
      "wordsAfter": [
        "Microschismus",
        "веерокрылки",
        "Республика"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "Microschismus,",
      "name": "Microschismus",
      "oddsLog10": 5.871036092226226,
      "start": 1809,
      "end": 1823,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "data",
        "Ключевые",
        "Слова",
        "Alucitidae"
      ],
      "wordsAfter": [
        "веерокрылки",
        "Республика",
        "новые"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "Microschismus",
      "name": "Microschismus",
      "oddsLog10": 5.871036092226226,
      "start": 1934,
      "end": 1947,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "The",
        "many-plumed",
        "moths",
        "genus"
      ],
      "wordsAfter": [
        "Fletcher",
        "�",
        "Type",
        "species"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "Microschismus antenna-␤tus",
      "name": "Microschismus antennatus",
      "oddsLog10": 14.029741754963005,
      "start": 1977,
      "end": 2003,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "Fletcher",
        "�",
        "Type",
        "species"
      ],
      "wordsAfter": [
        "Fletcher",
        "�",
        "by",
        "monotypy"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "Microschismus␤premnias",
      "name": "Microschismus premnias",
      "oddsLog10": 11.79380538551427,
      "start": 2465,
      "end": 2487,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "problem",
        "of",
        "the",
        "species"
      ],
      "wordsAfter": [
        "Meyrick",
        "�",
        "In",
        "the"
      ]
    },
    {
      "cardinality": 1,
      "verbatim": "Microschismus",
      "name": "Microschismus",
      "oddsLog10": 5.871036092226226,
      "start": 2532,
      "end": 2545,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "revision",
        "of",
        "the",
        "genus"
      ],
      "wordsAfter": [
        "Ustjuzhanin",
        "Kovtunovich",
        "�",
        "there"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "M. premnias:",
      "name": "M. premnias",
      "oddsLog10": 5.521658288032403,
      "start": 2612,
      "end": 2624,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "shown",
        "the",
        "syntypes",
        "of"
      ],
      "wordsAfter": [
        "photo",
        "of",
        "the",
        "adult"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "M. premnias",
      "name": "M. premnias",
      "oddsLog10": 5.521658288032403,
      "start": 2706,
      "end": 2717,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "genitalia",
        "Edward",
        "Meyrick",
        "described"
      ],
      "wordsAfter": [
        "on",
        "the",
        "series",
        "of"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "M. premnias",
      "name": "M. premnias",
      "oddsLog10": 5.521658288032403,
      "start": 2969,
      "end": 2980,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "As",
        "the",
        "lectotype",
        "of"
      ],
      "wordsAfter": [
        "we",
        "allocate",
        "the",
        "male"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "M. premnias",
      "name": "M. premnias",
      "oddsLog10": 5.521658288032403,
      "start": 3041,
      "end": 3052,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "habitat",
        "of",
        "the",
        "species"
      ],
      "wordsAfter": [
        "is",
        "Kwazulu-natal",
        "Pinetown",
        "The"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "Microschis-␤mus yakovlevi",
      "name": "Microschismus yakovlevi",
      "oddsLog10": 12.1087867733666,
      "start": 3206,
      "end": 3231,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "moths",
        "from",
        "South",
        "Africa"
      ],
      "wordsAfter": [
        "Ustjuzhanin",
        "et",
        "Kovtunovich",
        "sp�n"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "Microschismus alida",
      "name": "Microschismus alida",
      "oddsLog10": 12.189056121190019,
      "start": 3269,
      "end": 3288,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "et",
        "Kovtunovich",
        "sp�n",
        "and"
      ],
      "wordsAfter": [
        "Ustjuzhanin",
        "et",
        "Kovtunovich",
        "sp�n"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "Microschis-␤mus premnias",
      "name": "Microschismus premnias",
      "oddsLog10": 11.79380538551427,
      "start": 3420,
      "end": 3444,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "of",
        "two",
        "syntypes",
        "of"
      ],
      "wordsAfter": [
        "Meyrick",
        "�",
        "where",
        "the"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "Microschismus␤antennatus",
      "name": "Microschismus antennatus",
      "oddsLog10": 14.029741754963005,
      "start": 3519,
      "end": 3543,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "be",
        "the",
        "previously",
        "described"
      ],
      "wordsAfter": [
        "Fletcher",
        "�",
        "The",
        "image"
      ]
    },
    {
      "cardinality": 3,
      "verbatim": "M. premnias␤lectotype",
      "name": "M. premnias lectotype",
      "oddsLog10": 5.806582741178259,
      "start": 3573,
      "end": 3594,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "�",
        "The",
        "image",
        "of"
      ],
      "wordsAfter": [
        "adult",
        "is",
        "given",
        "for"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "Microschismus yakovlevi",
      "name": "Microschismus yakovlevi",
      "oddsLog10": 12.1087867733666,
      "start": 3845,
      "end": 3868,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "веерокрылок",
        "из",
        "Южной",
        "Африки"
      ],
      "wordsAfter": [
        "Ustjuzhanin",
        "et",
        "Kovtunovich",
        "sp�n"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "Microschismus alida",
      "name": "Microschismus alida",
      "oddsLog10": 12.189056121190019,
      "start": 3904,
      "end": 3923,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "et",
        "Kovtunovich",
        "sp�n",
        "и"
      ],
      "wordsAfter": [
        "Ustjuzhanin",
        "et",
        "Kovtunovich",
        "sp�n"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "Microschismus premnias",
      "name": "Microschismus premnias",
      "oddsLog10": 11.79380538551427,
      "start": 3984,
      "end": 4006,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "sp�n",
        "Из",
        "гетерогенных",
        "син-типов"
      ],
      "wordsAfter": [
        "Meyrick",
        "�",
        "выделя-ются",
        "лектотип"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "Microschismus antennatus",
      "name": "Microschismus antennatus",
      "oddsLog10": 14.029741754963005,
      "start": 4093,
      "end": 4117,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "второй",
        "оказался",
        "опи-санным",
        "ранее"
      ],
      "wordsAfter": [
        "Fletcher",
        "�",
        "Впервые",
        "приводится"
      ]
    },
    {
      "cardinality": 2,
      "verbatim": "M. premnias,",
      "name": "M. premnias",
      "oddsLog10": 5.521658288032403,
      "start": 4181,
      "end": 4193,
      "annotationNomenType": "NO_ANNOT",
      "wordsBefore": [
        "приводится",
        "изображение",
        "имаго",
        "лектотипа"
      ],
      "wordsAfter": [
        "а",
        "также",
        "описание",
        "и"
      ]
    }
  ]
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions