<?xml version='1.0' encoding='utf-8'?>
<?xml-stylesheet type="text/xsl" href="/v2/static/oai2.xsl"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
  <responseDate>2026-05-16T14:36:34Z</responseDate>
  <request identifier="oai:figshare.com:article/27108334" metadataPrefix="oai_datacite" verb="GetRecord">https://api.figshare.com/v2/oai</request>
  <GetRecord>
    <record>
      <header>
        <identifier>oai:figshare.com:article/27108334</identifier>
        <datestamp>2024-09-27T11:58:55Z</datestamp>
        <setSpec>category_28849</setSpec>
        <setSpec>category_29134</setSpec>
        <setSpec>category_29137</setSpec>
        <setSpec>category_27328</setSpec>
        <setSpec>portal_549</setSpec>
        <setSpec>item_type_3</setSpec>
        <setSpec>month_year_09_2024</setSpec>
      </header>
      <metadata>
        <resource xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.3/metadata.xsd">
          <identifier identifierType="DOI">10.5522/04/27108334.v1</identifier>
          <alternateIdentifiers>
            <alternateIdentifier alternateIdentifierType="URL">https://figshare.com/articles/dataset/Scrambled_text_training_Language_Models_to_correct_OCR_errors_using_synthetic_data/27108334</alternateIdentifier>
          </alternateIdentifiers>
          <relatedIdentifiers>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/49417669</relatedIdentifier>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/49417675</relatedIdentifier>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/49417678</relatedIdentifier>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/49433905</relatedIdentifier>
          </relatedIdentifiers>
          <creators>
            <creator>
              <creatorName>Bourne, Jonno</creatorName>
              <givenName>Jonno</givenName>
              <familyName>Bourne</familyName>
              <nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org">0000-0003-2616-3716</nameIdentifier>
            </creator>
          </creators>
          <titles>
            <title><![CDATA[Scrambled text: training Language Models to correct OCR errors using synthetic data]]></title>
          </titles>
          <subjects>
            <subject>Natural language processing</subject>
            <subject>Library studies</subject>
            <subject>Open access</subject>
            <subject>Digital history</subject>
            <subject>Newspapers</subject>
            <subject>OCR</subject>
            <subject>NLP</subject>
            <subject>synthetic data</subject>
          </subjects>
          <dates>
            <date dateType="Created">2024-09-27</date>
            <date dateType="Updated">2024-09-27</date>
          </dates>
          <resourceType resourceTypeGeneral="Dataset">Dataset</resourceType>
          <publicationYear>2024</publicationYear>
          <publisher>University College London</publisher>
          <rightsList>
            <rights rightsURI="https://opensource.org/licenses/MIT" rightsIdentifier="MIT"/>
            <rights rightsURI="http://purl.org/coar/access_right/c_abf2" rightsIdentifier="open access"/>
          </rightsList>
          <descriptions>
            <description descriptionType="Abstract"><![CDATA[<p dir="ltr">This data repository contains the key datasets required to reproduce the paper "Scrambled text: training Language Models to correct OCR errors using synthetic data".</p><p dir="ltr">In addition it contains the 10,000 synthetic 19th century articles generated using GPT4o. These articles are available both as a csv with the prompt parameters as columns as well as the articles as individual text files.</p><p dir="ltr">The files in the repository are as follows</p><ul><li><b>ncse_hf_dataset</b>: A huggingface dictionary dataset containing 91 articles from the Nineteenth Century Serials Edition (NCSE) with original OCR and the transcribed groundtruth. This dataset is used as the testset in the paper</li><li><b>synth_gt.zip</b>: A zip file containing 5 parquet files of training data from the 10,000 synthetic articles. The each parquet file is made up of observations of a fixed length of tokens, for a total of 2 Million tokens. The observation lengths are 200, 100, 50, 25, 10.</li><li><b>synthetic_articles.zip</b>: A zip file containing the csv of all the synthetic articles and the prompts used to generate them.</li><li><b>synthetic_articles_text.zip</b>: A zip file containing the text files of all the synthetic articles. The file names are the prompt parameters and the id reference from the synthetic article csv.</li></ul><p dir="ltr">The data in this repo is used by the code repositories associated with the project </p><ul><li>https://github.com/JonnoB/scrambledtext_analysis</li><li>https://github.com/JonnoB/training_lms_with_synthetic_data</li></ul><p dir="ltr"><br></p>]]></description>
          </descriptions>
        </resource>
      </metadata>
    </record>
  </GetRecord>
</OAI-PMH>
