-
Notifications
You must be signed in to change notification settings - Fork 5
Why does � appear in output instead of non-letter characters? #132
Copy link
Copy link
Closed
Description
Hi @dimus, I'm playing with gnfinder v1.0.0+ and am a bit puzzled. I'm regularly seeing � appear in the JSON output. I'm using gnfinder on a Mac with the flags --utf8-input --words-around 4, yet I'm getting �. I'm attaching an example text file, with the JSON I get from this command:
gnfinder --utf8-input --words-around 4 --format pretty text_0.txt
It seems that anything which isn't a letter is replaced, sometimes whole words. For example, years, periods, etc. all get replaced. Is this the intention? It means that useful information about the context of the name is lost.
{
"metadata": {
"documentation": "",
"date": "2022-11-03T16:17:47.499171Z",
"gnfinderVersion": "v1.0.0+",
"inputFile": "text_0.txt",
"textExtractSec": 0.00004,
"nameFindingSec": 0.001241,
"totalSec": 0.001281,
"wordsAround": 4,
"language": "eng",
"withBayes": true,
"totalWords": 582,
"totalNameCandidates": 200,
"totalNames": 39
},
"names": [
{
"cardinality": 1,
"verbatim": "Microschismus",
"name": "Microschismus",
"oddsLog10": 5.871036092226226,
"start": 221,
"end": 234,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"moths",
"of",
"the",
"genus"
],
"wordsAfter": [
"Fletcher",
"�",
"Lepidoptera",
"Alucitidae"
]
},
{
"cardinality": 1,
"verbatim": "(Lepidoptera:",
"name": "Lepidoptera",
"oddsLog10": 5.112446580930601,
"start": 250,
"end": 263,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"genus",
"Microschismus",
"Fletcher",
"�"
],
"wordsAfter": [
"Alucitidae",
"from",
"the",
"Republic"
]
},
{
"cardinality": 1,
"verbatim": "Alucitidae)",
"name": "Alucitidae",
"oddsLog10": 4.764456240418255,
"start": 264,
"end": 275,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"Microschismus",
"Fletcher",
"�",
"Lepidoptera"
],
"wordsAfter": [
"from",
"the",
"Republic",
"of"
]
},
{
"cardinality": 1,
"verbatim": "Microschismus",
"name": "Microschismus",
"oddsLog10": 5.871036092226226,
"start": 489,
"end": 502,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"moths",
"of",
"the",
"genus"
],
"wordsAfter": [
"Fletcher",
"�",
"Lepidoptera",
"Alucitidae"
]
},
{
"cardinality": 1,
"verbatim": "(Lepidoptera:",
"name": "Lepidoptera",
"oddsLog10": 5.112446580930601,
"start": 518,
"end": 531,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"genus",
"Microschismus",
"Fletcher",
"�"
],
"wordsAfter": [
"Alucitidae",
"from",
"the",
"Republic"
]
},
{
"cardinality": 1,
"verbatim": "Alucitidae)",
"name": "Alucitidae",
"oddsLog10": 4.764456240418255,
"start": 532,
"end": 543,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"Microschismus",
"Fletcher",
"�",
"Lepidoptera"
],
"wordsAfter": [
"from",
"the",
"Republic",
"of"
]
},
{
"cardinality": 1,
"verbatim": "Microschismus",
"name": "Microschismus",
"oddsLog10": 5.871036092226226,
"start": 663,
"end": 676,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"äàííûå",
"ïî",
"âååðîêðûëêàì",
"ðîäà"
],
"wordsAfter": [
"Fletcher",
"�",
"Lepidoptera",
"Alucitidae"
]
},
{
"cardinality": 1,
"verbatim": "(Lepidoptera:",
"name": "Lepidoptera",
"oddsLog10": 5.112446580930601,
"start": 692,
"end": 705,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"ðîäà",
"Microschismus",
"Fletcher",
"�"
],
"wordsAfter": [
"Alucitidae",
"Þæíîé",
"Àôðèêè",
"Petr"
]
},
{
"cardinality": 1,
"verbatim": "Alucitidae)",
"name": "Alucitidae",
"oddsLog10": 4.764456240418255,
"start": 706,
"end": 717,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"Microschismus",
"Fletcher",
"�",
"Lepidoptera"
],
"wordsAfter": [
"Þæíîé",
"Àôðèêè",
"Petr",
"Ustjuzhanin"
]
},
{
"cardinality": 1,
"verbatim": "Alucitidae,",
"name": "Alucitidae",
"oddsLog10": 4.764456240418255,
"start": 1686,
"end": 1697,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"author",
"Key",
"Words"
],
"wordsAfter": [
"Microschismus",
"many-plumed",
"moths",
"Republic"
]
},
{
"cardinality": 1,
"verbatim": "Microschismus,",
"name": "Microschismus",
"oddsLog10": 5.871036092226226,
"start": 1698,
"end": 1712,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"author",
"Key",
"Words",
"Alucitidae"
],
"wordsAfter": [
"many-plumed",
"moths",
"Republic",
"of"
]
},
{
"cardinality": 1,
"verbatim": "Alucitidae,",
"name": "Alucitidae",
"oddsLog10": 4.764456240418255,
"start": 1797,
"end": 1808,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"new",
"data",
"Ключевые",
"Слова"
],
"wordsAfter": [
"Microschismus",
"веерокрылки",
"Республика"
]
},
{
"cardinality": 1,
"verbatim": "Microschismus,",
"name": "Microschismus",
"oddsLog10": 5.871036092226226,
"start": 1809,
"end": 1823,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"data",
"Ключевые",
"Слова",
"Alucitidae"
],
"wordsAfter": [
"веерокрылки",
"Республика",
"новые"
]
},
{
"cardinality": 1,
"verbatim": "Microschismus",
"name": "Microschismus",
"oddsLog10": 5.871036092226226,
"start": 1934,
"end": 1947,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"The",
"many-plumed",
"moths",
"genus"
],
"wordsAfter": [
"Fletcher",
"�",
"Type",
"species"
]
},
{
"cardinality": 2,
"verbatim": "Microschismus antenna-tus",
"name": "Microschismus antennatus",
"oddsLog10": 14.029741754963005,
"start": 1977,
"end": 2003,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"Fletcher",
"�",
"Type",
"species"
],
"wordsAfter": [
"Fletcher",
"�",
"by",
"monotypy"
]
},
{
"cardinality": 2,
"verbatim": "Microschismuspremnias",
"name": "Microschismus premnias",
"oddsLog10": 11.79380538551427,
"start": 2465,
"end": 2487,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"problem",
"of",
"the",
"species"
],
"wordsAfter": [
"Meyrick",
"�",
"In",
"the"
]
},
{
"cardinality": 1,
"verbatim": "Microschismus",
"name": "Microschismus",
"oddsLog10": 5.871036092226226,
"start": 2532,
"end": 2545,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"revision",
"of",
"the",
"genus"
],
"wordsAfter": [
"Ustjuzhanin",
"Kovtunovich",
"�",
"there"
]
},
{
"cardinality": 2,
"verbatim": "M. premnias:",
"name": "M. premnias",
"oddsLog10": 5.521658288032403,
"start": 2612,
"end": 2624,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"shown",
"the",
"syntypes",
"of"
],
"wordsAfter": [
"photo",
"of",
"the",
"adult"
]
},
{
"cardinality": 2,
"verbatim": "M. premnias",
"name": "M. premnias",
"oddsLog10": 5.521658288032403,
"start": 2706,
"end": 2717,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"genitalia",
"Edward",
"Meyrick",
"described"
],
"wordsAfter": [
"on",
"the",
"series",
"of"
]
},
{
"cardinality": 2,
"verbatim": "M. premnias",
"name": "M. premnias",
"oddsLog10": 5.521658288032403,
"start": 2969,
"end": 2980,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"As",
"the",
"lectotype",
"of"
],
"wordsAfter": [
"we",
"allocate",
"the",
"male"
]
},
{
"cardinality": 2,
"verbatim": "M. premnias",
"name": "M. premnias",
"oddsLog10": 5.521658288032403,
"start": 3041,
"end": 3052,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"habitat",
"of",
"the",
"species"
],
"wordsAfter": [
"is",
"Kwazulu-natal",
"Pinetown",
"The"
]
},
{
"cardinality": 2,
"verbatim": "Microschis-mus yakovlevi",
"name": "Microschismus yakovlevi",
"oddsLog10": 12.1087867733666,
"start": 3206,
"end": 3231,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"moths",
"from",
"South",
"Africa"
],
"wordsAfter": [
"Ustjuzhanin",
"et",
"Kovtunovich",
"sp�n"
]
},
{
"cardinality": 2,
"verbatim": "Microschismus alida",
"name": "Microschismus alida",
"oddsLog10": 12.189056121190019,
"start": 3269,
"end": 3288,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"et",
"Kovtunovich",
"sp�n",
"and"
],
"wordsAfter": [
"Ustjuzhanin",
"et",
"Kovtunovich",
"sp�n"
]
},
{
"cardinality": 2,
"verbatim": "Microschis-mus premnias",
"name": "Microschismus premnias",
"oddsLog10": 11.79380538551427,
"start": 3420,
"end": 3444,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"of",
"two",
"syntypes",
"of"
],
"wordsAfter": [
"Meyrick",
"�",
"where",
"the"
]
},
{
"cardinality": 2,
"verbatim": "Microschismusantennatus",
"name": "Microschismus antennatus",
"oddsLog10": 14.029741754963005,
"start": 3519,
"end": 3543,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"be",
"the",
"previously",
"described"
],
"wordsAfter": [
"Fletcher",
"�",
"The",
"image"
]
},
{
"cardinality": 3,
"verbatim": "M. premniaslectotype",
"name": "M. premnias lectotype",
"oddsLog10": 5.806582741178259,
"start": 3573,
"end": 3594,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"�",
"The",
"image",
"of"
],
"wordsAfter": [
"adult",
"is",
"given",
"for"
]
},
{
"cardinality": 2,
"verbatim": "Microschismus yakovlevi",
"name": "Microschismus yakovlevi",
"oddsLog10": 12.1087867733666,
"start": 3845,
"end": 3868,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"веерокрылок",
"из",
"Южной",
"Африки"
],
"wordsAfter": [
"Ustjuzhanin",
"et",
"Kovtunovich",
"sp�n"
]
},
{
"cardinality": 2,
"verbatim": "Microschismus alida",
"name": "Microschismus alida",
"oddsLog10": 12.189056121190019,
"start": 3904,
"end": 3923,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"et",
"Kovtunovich",
"sp�n",
"и"
],
"wordsAfter": [
"Ustjuzhanin",
"et",
"Kovtunovich",
"sp�n"
]
},
{
"cardinality": 2,
"verbatim": "Microschismus premnias",
"name": "Microschismus premnias",
"oddsLog10": 11.79380538551427,
"start": 3984,
"end": 4006,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"sp�n",
"Из",
"гетерогенных",
"син-типов"
],
"wordsAfter": [
"Meyrick",
"�",
"выделя-ются",
"лектотип"
]
},
{
"cardinality": 2,
"verbatim": "Microschismus antennatus",
"name": "Microschismus antennatus",
"oddsLog10": 14.029741754963005,
"start": 4093,
"end": 4117,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"второй",
"оказался",
"опи-санным",
"ранее"
],
"wordsAfter": [
"Fletcher",
"�",
"Впервые",
"приводится"
]
},
{
"cardinality": 2,
"verbatim": "M. premnias,",
"name": "M. premnias",
"oddsLog10": 5.521658288032403,
"start": 4181,
"end": 4193,
"annotationNomenType": "NO_ANNOT",
"wordsBefore": [
"приводится",
"изображение",
"имаго",
"лектотипа"
],
"wordsAfter": [
"а",
"также",
"описание",
"и"
]
}
]
}
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels