import re
from dataclasses import dataclass
from typing import Optional

import openai

@dataclass
class TocResponse:
  """Table-of-contents text paired with the token usage of the request that produced it."""
  # Table-of-contents text; expected to hold one `* label | title | page` entry per line.
  toc: str
  # Number of prompt (input) tokens reported for the request.
  prompt_tokens: int
  # Number of completion (output) tokens reported for the request.
  completion_tokens: int

# System prompt: describes the `* label | title | page` output format and gives
# few-shot examples of noisy OCR input alongside the cleaned output.
_TOC_SYSTEM_PROMPT = """
You are a librarian extracting table of contents data in a structured format. The format you will need to output is as follows:

```
* {label (optional)} | {title} | {page number}
```

### Important Instructions:
- Fix any typos or bad characters generated by the OCR.
- Change any text in ALL UPPERCASE to be in Normal Title Case.

### Examples:

Input:
```
                                   PAGE
PREFACE                             1x
PART 1: THIS VVORLD                  1
   Chapter I.
   Of the Nature of Flatland
                                      3
   Chapter IT.
   Of the Cl'mate and Houses in
       Hlatland  . . . dsa. .a;;; ... 5
PART 2: OTHER WORLDS
                                    42
```

Output:
```
* | Preface | ix
* Part 1 | This World | 1
    ** Chapter I | Of the Nature of Flatland | 3
    ** Chapter II | Of the Climate and Houses in Flatland | 5
* Part 2 | Other Worlds | 42
```


Input:
```
      CONTENTS
CHapter 1. ...... ... . . 1
Ch^pter 2            .. @ 25
Chaptr               .   38
Chaptr  4                 48
```
Output
```
* | Chapter 1 | 1
* | Chapter 2 | 25
* | Chapter 3 | 38
* | Chapter 4 | 48
```

You can nest when necessary:

Input:
```
A. Technology
     Computers     ...       1
          Hard-drives        2
          Software           8
    II. Machinery           1[
   III. Hardware            3/
B. Agriculture
```

```
* A | Technology |
    ** I | Computers | 1
        *** | Hard-drives | 2
        *** | Software | 8
    ** II | Machinery | 11
    ** III | Hardware | 37
* B | Agriculture |
```
"""

# Optional leading spaces followed by the run of '*' that marks a TOC entry.
# Compiled once at import time instead of being looked up for every line.
_TOC_BULLET_RE = re.compile(r'^ *\*+')

# How many trailing lines of the previous output are echoed back to the model
# when continuing a table of contents.
_PREV_TOC_TAIL_LINES = 10


def _clean_toc_line(line: str) -> str:
  """Normalize one output line towards the `* label | title | page` shape.

  Lines that do not start with optional spaces followed by one or more `*`
  are returned unchanged. For entry lines the number of `|` separators
  decides the repair:

  - 0 pipes: the text is treated as the title; empty label and page columns
    are added (`* Foo` -> `* | Foo |`).
  - 1 pipe: the label column is assumed missing and an empty one is
    inserted (`* Foo | 5` -> `* | Foo | 5`).
  - 2 pipes: already well formed; returned unchanged.
  - 3 pipes: the first pipe is replaced by a space, leaving two separators.
  - more: cannot be repaired; returned unchanged.
  """
  bullet = _TOC_BULLET_RE.match(line)
  if not bullet:
    return line

  stars = bullet.group()  # includes any leading indentation
  rest = line[bullet.end():].strip()
  pipes = line.count('|')

  if pipes == 0:
    return f'{stars} | {rest} |'
  if pipes == 1:
    return f'{stars} | {rest}'
  if pipes == 3:
    return line.replace('|', ' ', 1)
  # Two pipes is the expected shape; four or more is beyond repair here.
  return line


def _build_toc_message(raw_ocr: str, book_title: str, prev_toc: Optional[str]) -> str:
  """Build the user message: a continuation request if `prev_toc` is non-empty, else a fresh one."""
  if prev_toc:
    # Only the tail of the previous output is sent back to the model.
    prev_tail = '\n'.join(prev_toc.split('\n')[-_PREV_TOC_TAIL_LINES:])
    return f"""
The last table of contents was too long too fit in your context. Continue extracting the table of contents of "{book_title}".

Here is the end of what you last output. Do not output this again, though.

```
{prev_tail}
```

And here is the rest of the OCR text:
```
{raw_ocr}
```
""".strip()

  return f"""
Extract the table of contents from this OCR text of "{book_title}":

```
{raw_ocr}
```
""".strip()


def clean_raw_toc_ocr(raw_ocr: str, book_title: str, prev_toc: Optional[str] = None) -> TocResponse:
  """Ask the chat model to convert raw table-of-contents OCR into structured lines.

  Args:
    raw_ocr: OCR text of the table-of-contents pages (or the remaining part
      of it when continuing).
    book_title: Title of the book; interpolated into the user message.
    prev_toc: Output of a previous call for the same book. When non-empty,
      its last 10 lines are sent to the model together with a request to
      continue from there instead of starting over.

  Returns:
    A `TocResponse` holding the model output, with every line passed through
    `_clean_toc_line`, and the prompt/completion token counts reported by
    the API. If the model returns no text content the `toc` is an empty
    string.

  Raises:
    Whatever `openai.chat.completions.create` raises; errors are not caught
    here.
  """
  message = _build_toc_message(raw_ocr, book_title, prev_toc)
  print('openai request', message[0:500] + '...')
  completion = openai.chat.completions.create(
    model="gpt-3.5-turbo-1106",
    messages=[
      {"role": "system", "content": _TOC_SYSTEM_PROMPT},
      {"role": "user", "content": message},
    ],
    # max_tokens=1024,
    n=1,
    stop=None,
    temperature=0.5,
  )

  # `content` may be None; fall back to an empty string rather than failing
  # on `None.split`.
  toc = completion.choices[0].message.content or ''
  cleaned_toc = '\n'.join(_clean_toc_line(line) for line in toc.split('\n'))
  return TocResponse(
      cleaned_toc,
      completion.usage.prompt_tokens,
      completion.usage.completion_tokens)