diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..5f35aad --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,42 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +Pure Go library for parsing Excel 97-2004 (.xls) files in BIFF5/BIFF8 format. Fork of `github.com/extrame/xls`. Does **not** handle .xlsx files. + +## Build & Test + +No `go.mod` — pre-modules project. Requires `GO111MODULE=on`: + +```bash +GO111MODULE=on go test -v -race ./... # run all tests +GO111MODULE=on go test -v -run TestBig # run single test +``` + +No Makefile, no linter config. + +## Architecture + +``` +WorkBook → WorkSheet → Row → Col (cell) +``` + +- **Entry points**: `Open()`, `OpenWithCloser()`, `OpenReader()` in `xls.go` +- **WorkBook** (`workbook.go`): parses BIFF record stream, manages sheets/fonts/formats/SST +- **WorkSheet** (`worksheet.go`): lazy-loaded on `GetSheet(i)`, contains rows by index +- **Row** (`row.go`): `Col(i)` (merged-cell aware) vs `ColExact(i)` (exact match) +- **Cell types** (`col.go`): `contentHandler` interface — implementations: `NumberCol`, `LabelsstCol`, `labelCol`, `BlankCol`, `RkCol`, `MulrkCol`, `FormulaCol`, `HyperLink`, etc. +- **Binary records** (`bof.go`): all data as `bof` structs (ID uint16 + Size uint16) +- **Date handling** (`date.go`): Excel serial dates → `time.Time`, supports 1900 & 1904 systems +- **Formatting** (`xf.go`, `font.go`, `format.go`): XF records map cells to fonts/formats + +Key dependency: `github.com/extrame/ole2` for OLE2 container parsing. + +## Parsing Flow + +1. `Open()` → OLE2 parse → create WorkBook +2. `Parse()` → read BIFF record stream → extract sheets, SST, XF, fonts, formats +3. `GetSheet(n)` → lazy-parse worksheet records into rows/cells +4. Cell access → format values via XF index lookup diff --git a/bof.go b/bof.go index ecb1f1e..33957b2 100644 --- a/bof.go +++ b/bof.go @@ -14,6 +14,9 @@ type bof struct { //read the utf16 string from reader func (b *bof) utf16String(buf io.ReadSeeker, count uint32) string { + if count == 0 { + return "" + } var bts = make([]uint16, count) binary.Read(buf, binary.LittleEndian, &bts) runes := utf16.Decode(bts[:len(bts)-1]) diff --git a/cell_range.go b/cell_range.go index 2dde04e..8fab5a0 100644 --- a/cell_range.go +++ b/cell_range.go @@ -14,7 +14,7 @@ type Ranger interface { type CellRange struct { FirstRowB uint16 LastRowB uint16 - FristColB uint16 + FirstColB uint16 LastColB uint16 } @@ -27,7 +27,7 @@ func (c *CellRange) LastRow() uint16 { } func (c *CellRange) FirstCol() uint16 { - return c.FristColB + return c.FirstColB } func (c *CellRange) LastCol() uint16 { @@ -48,7 +48,7 @@ type HyperLink struct { //get the hyperlink string, use the public variable Url to get the original Url func (h *HyperLink) String(wb *WorkBook) []string { - res := make([]string, h.LastColB-h.FristColB+1) + res := make([]string, h.LastColB-h.FirstColB+1) var str string if h.IsUrl { str = fmt.Sprintf("%s(%s)", h.Description, h.Url) @@ -56,7 +56,7 @@ func (h *HyperLink) String(wb *WorkBook) []string { str = h.ExtendedFilePath } - for i := uint16(0); i < h.LastColB-h.FristColB+1; i++ { + for i := uint16(0); i < h.LastColB-h.FirstColB+1; i++ { res[i] = str } return res diff --git a/col.go b/col.go index 8636566..39435f2 100644 --- a/col.go +++ b/col.go @@ -168,9 +168,30 @@ type NumberCol struct { } func (c *NumberCol) String(wb *WorkBook) []string { - if fNo := wb.Xfs[c.Index].formatNo(); fNo != 0 { - t := timeFromExcelTime(c.Float, wb.dateMode == 1) - return []string{yymmdd.Format(t, wb.Formats[fNo].str)} + idx := int(c.Index) + if idx < len(wb.Xfs) { + fNo := wb.Xfs[idx].formatNo() + if fNo >= 164 { // user defined format + if formatter := wb.Formats[fNo]; formatter != nil { + formatterLower := strings.ToLower(formatter.str) + if formatterLower == "general" || + strings.Contains(formatter.str, "#") || + strings.Contains(formatter.str, ".00") || + strings.Contains(formatterLower, "m/y") || + strings.Contains(formatterLower, "d/y") || + strings.Contains(formatterLower, "m.y") || + strings.Contains(formatterLower, "d.y") || + strings.Contains(formatterLower, "h:") || + strings.Contains(formatterLower, "д.г") { + return []string{strconv.FormatFloat(c.Float, 'f', -1, 64)} + } + t := timeFromExcelTime(c.Float, wb.dateMode == 1) + return []string{yymmdd.Format(t, formatter.str)} + } + } else if 14 <= fNo && fNo <= 17 || fNo == 22 || 27 <= fNo && fNo <= 36 || 50 <= fNo && fNo <= 58 { // built-in date format + t := timeFromExcelTime(c.Float, wb.dateMode == 1) + return []string{t.Format(time.RFC3339)} + } } return []string{strconv.FormatFloat(c.Float, 'f', -1, 64)} } @@ -218,7 +239,11 @@ type LabelsstCol struct { } func (c *LabelsstCol) String(wb *WorkBook) []string { - return []string{wb.sst[int(c.Sst)]} + idx := int(c.Sst) + if idx < len(wb.sst) { + return []string{wb.sst[idx]} + } + return []string{""} } type labelCol struct { diff --git a/comparexlsxlsx.go b/comparexlsxlsx.go index a161abd..8b62482 100644 --- a/comparexlsxlsx.go +++ b/comparexlsxlsx.go @@ -26,6 +26,9 @@ func CompareXlsXlsx(xlsfilepathname string, xlsxfilepathname string) string { } for row, xlsxRow := range xlsxSheet.Rows { xlsRow := xlsSheet.Row(row) + if xlsRow == nil { + continue + } for cell, xlsxCell := range xlsxRow.Cells { xlsxText := xlsxCell.String() xlsText := xlsRow.Col(cell) diff --git a/example_test.go b/example_test.go index e62fc5b..70431e0 100644 --- a/example_test.go +++ b/example_test.go @@ -10,8 +10,9 @@ func ExampleOpen() { } } -func ExampleWorkBook_NumberSheets() { - if xlFile, err := Open("Table.xls", "utf-8"); err == nil { +func ExampleWorkBook_NumSheets() { + if xlFile, closer, err := OpenWithCloser("Table.xls", "utf-8"); err == nil { + defer closer.Close() for i := 0; i < xlFile.NumSheets(); i++ { sheet := xlFile.GetSheet(i) fmt.Println(sheet.Name) @@ -21,13 +22,17 @@ func ExampleWorkBook_NumberSheets() { //Output: read the content of first two cols in each row func ExampleWorkBook_GetSheet() { - if xlFile, err := Open("Table.xls", "utf-8"); err == nil { + if xlFile, closer, err := OpenWithCloser("Table.xls", "utf-8"); err == nil { + defer closer.Close() if sheet1 := xlFile.GetSheet(0); sheet1 != nil { fmt.Print("Total Lines ", sheet1.MaxRow, sheet1.Name) col1 := sheet1.Row(0).Col(0) col2 := sheet1.Row(0).Col(0) for i := 0; i <= (int(sheet1.MaxRow)); i++ { row1 := sheet1.Row(i) + if row1 == nil { + continue + } col1 = row1.Col(0) col2 = row1.Col(1) fmt.Print("\n", col1, ",", col2) diff --git a/row.go b/row.go index 0908172..5615ac9 100644 --- a/row.go +++ b/row.go @@ -23,12 +23,19 @@ func (r *Row) Col(i int) string { serial := uint16(i) if ch, ok := r.cols[serial]; ok { strs := ch.String(r.wb) - return strs[0] + if len(strs) > 0 { + return strs[0] + } + return "" } else { for _, v := range r.cols { if v.FirstCol() <= serial && v.LastCol() >= serial { strs := v.String(r.wb) - return strs[serial-v.FirstCol()] + idx := int(serial - v.FirstCol()) + if idx < len(strs) { + return strs[idx] + } + return "" } } } @@ -41,7 +48,10 @@ func (r *Row) ColExact(i int) string { serial := uint16(i) if ch, ok := r.cols[serial]; ok { strs := ch.String(r.wb) - return strs[0] + if len(strs) > 0 { + return strs[0] + } + return "" } return "" } diff --git a/sst_test.go b/sst_test.go new file mode 100644 index 0000000..8ef5eef --- /dev/null +++ b/sst_test.go @@ -0,0 +1,140 @@ +package xls + +import ( + "bytes" + "encoding/binary" + "testing" +) + +func writeBIFFRecord(buf *bytes.Buffer, id uint16, data []byte) { + binary.Write(buf, binary.LittleEndian, id) + binary.Write(buf, binary.LittleEndian, uint16(len(data))) + buf.Write(data) +} + +func makeBOFData() []byte { + var b bytes.Buffer + binary.Write(&b, binary.LittleEndian, uint16(0x0600)) // Ver: BIFF8 + binary.Write(&b, binary.LittleEndian, uint16(0x0005)) // Type: workbook globals + b.Write(make([]byte, 12)) // Id_make, Year, Flags, Min_ver + return b.Bytes() +} + +// TestSSTContinueRichtext tests that richtext formatting runs spanning +// a record boundary don't corrupt subsequent SST entries. +// String 0: "ABC" with 2 formatting runs — chars fit in SST, runs overflow to CONTINUE. +// String 1: "DEF" in CONTINUE after the formatting runs. +func TestSSTContinueRichtext(t *testing.T) { + // SST record: SstInfo + string 0 header + "ABC" (no room for formatting runs) + var sstData bytes.Buffer + binary.Write(&sstData, binary.LittleEndian, uint32(2)) // Total + binary.Write(&sstData, binary.LittleEndian, uint32(2)) // Count + binary.Write(&sstData, binary.LittleEndian, uint16(3)) // string 0 char count + sstData.WriteByte(0x08) // flag: has richtext + binary.Write(&sstData, binary.LittleEndian, uint16(2)) // richtext_num = 2 runs + sstData.Write([]byte("ABC")) // char data (Latin1) + // 0 bytes left for 8-byte formatting runs → overflow to CONTINUE + + // CONTINUE record: formatting runs + string 1 + var contData bytes.Buffer + contData.Write(make([]byte, 8)) // 2 runs × 4 bytes (dummy) + binary.Write(&contData, binary.LittleEndian, uint16(3)) // string 1 char count + contData.WriteByte(0x00) // flag: plain Latin1 + contData.Write([]byte("DEF")) + + var stream bytes.Buffer + writeBIFFRecord(&stream, 0x0809, makeBOFData()) + writeBIFFRecord(&stream, 0x00FC, sstData.Bytes()) + writeBIFFRecord(&stream, 0x003C, contData.Bytes()) + + wb := &WorkBook{Formats: make(map[uint16]*Format)} + wb.Parse(bytes.NewReader(stream.Bytes())) + + if len(wb.sst) != 2 { + t.Fatalf("expected 2 SST entries, got %d", len(wb.sst)) + } + if wb.sst[0] != "ABC" { + t.Errorf("sst[0]: expected %q, got %q", "ABC", wb.sst[0]) + } + if wb.sst[1] != "DEF" { + t.Errorf("sst[1]: expected %q, got %q", "DEF", wb.sst[1]) + } +} + +// TestSSTContinueRichtextPartial tests the case where richtext formatting +// runs are partially read in the SST record (boundary falls mid-formatting). +func TestSSTContinueRichtextPartial(t *testing.T) { + // SST record: SstInfo + string 0 header + "ABC" + 4 of 8 formatting bytes + var sstData bytes.Buffer + binary.Write(&sstData, binary.LittleEndian, uint32(2)) // Total + binary.Write(&sstData, binary.LittleEndian, uint32(2)) // Count + binary.Write(&sstData, binary.LittleEndian, uint16(3)) // string 0 char count + sstData.WriteByte(0x08) // flag: has richtext + binary.Write(&sstData, binary.LittleEndian, uint16(2)) // richtext_num = 2 + sstData.Write([]byte("ABC")) // chars + sstData.Write(make([]byte, 4)) // partial: 4 of 8 formatting bytes + + // CONTINUE: remaining 4 formatting bytes + string 1 + var contData bytes.Buffer + contData.Write(make([]byte, 4)) // remaining formatting bytes + binary.Write(&contData, binary.LittleEndian, uint16(3)) // string 1 char count + contData.WriteByte(0x00) // flag + contData.Write([]byte("DEF")) + + var stream bytes.Buffer + writeBIFFRecord(&stream, 0x0809, makeBOFData()) + writeBIFFRecord(&stream, 0x00FC, sstData.Bytes()) + writeBIFFRecord(&stream, 0x003C, contData.Bytes()) + + wb := &WorkBook{Formats: make(map[uint16]*Format)} + wb.Parse(bytes.NewReader(stream.Bytes())) + + if len(wb.sst) != 2 { + t.Fatalf("expected 2 SST entries, got %d", len(wb.sst)) + } + if wb.sst[0] != "ABC" { + t.Errorf("sst[0]: expected %q, got %q", "ABC", wb.sst[0]) + } + if wb.sst[1] != "DEF" { + t.Errorf("sst[1]: expected %q, got %q", "DEF", wb.sst[1]) + } +} + +// TestSSTContinuePhonetic tests that phonetic data spanning a record +// boundary is properly skipped in the CONTINUE handler. +func TestSSTContinuePhonetic(t *testing.T) { + // SST record: SstInfo + string 0 with phonetic flag, chars fit but phonetic overflows + var sstData bytes.Buffer + binary.Write(&sstData, binary.LittleEndian, uint32(2)) // Total + binary.Write(&sstData, binary.LittleEndian, uint32(2)) // Count + binary.Write(&sstData, binary.LittleEndian, uint16(3)) // string 0 char count + sstData.WriteByte(0x04) // flag: has phonetic + binary.Write(&sstData, binary.LittleEndian, uint32(12)) // phonetic_size = 12 bytes + sstData.Write([]byte("ABC")) // chars + // 0 bytes for 12-byte phonetic data → overflow + + // CONTINUE: phonetic data + string 1 + var contData bytes.Buffer + contData.Write(make([]byte, 12)) // phonetic data (dummy) + binary.Write(&contData, binary.LittleEndian, uint16(3)) // string 1 char count + contData.WriteByte(0x00) // flag + contData.Write([]byte("DEF")) + + var stream bytes.Buffer + writeBIFFRecord(&stream, 0x0809, makeBOFData()) + writeBIFFRecord(&stream, 0x00FC, sstData.Bytes()) + writeBIFFRecord(&stream, 0x003C, contData.Bytes()) + + wb := &WorkBook{Formats: make(map[uint16]*Format)} + wb.Parse(bytes.NewReader(stream.Bytes())) + + if len(wb.sst) != 2 { + t.Fatalf("expected 2 SST entries, got %d", len(wb.sst)) + } + if wb.sst[0] != "ABC" { + t.Errorf("sst[0]: expected %q, got %q", "ABC", wb.sst[0]) + } + if wb.sst[1] != "DEF" { + t.Errorf("sst[1]: expected %q, got %q", "DEF", wb.sst[1]) + } +} diff --git a/workbook.go b/workbook.go index f917d53..d7ca478 100644 --- a/workbook.go +++ b/workbook.go @@ -5,7 +5,6 @@ import ( "encoding/binary" "golang.org/x/text/encoding/charmap" "io" - "os" "unicode/utf16" ) @@ -22,10 +21,12 @@ type WorkBook struct { Author string rs io.ReadSeeker sst []string - continue_utf16 uint16 - continue_rich uint16 - continue_apsb uint32 - dateMode uint16 + continue_utf16 uint16 + continue_rich uint16 + continue_apsb uint32 + continue_rich_pending uint32 // remaining richtext bytes to skip in CONTINUE + continue_apsb_pending uint32 // remaining phonetic bytes to skip in CONTINUE + dateMode uint16 } //read workbook from ole2 file @@ -64,7 +65,7 @@ func (w *WorkBook) addFont(font *FontInfo, buf io.ReadSeeker) { func (w *WorkBook) addFormat(format *Format) { if w.Formats == nil { - os.Exit(1) + w.Formats = make(map[uint16]*Format) } w.Formats[format.Head.Index] = format } @@ -72,8 +73,11 @@ func (w *WorkBook) addFormat(format *Format) { func (wb *WorkBook) parseBof(buf io.ReadSeeker, b *bof, pre *bof, offset_pre int) (after *bof, after_using *bof, offset int) { after = b after_using = pre + offset = offset_pre var bts = make([]byte, b.Size) - binary.Read(buf, binary.LittleEndian, bts) + if err := binary.Read(buf, binary.LittleEndian, bts); err != nil { + return + } buf_item := bytes.NewReader(bts) switch b.Id { case 0x809: @@ -87,6 +91,38 @@ func (wb *WorkBook) parseBof(buf io.ReadSeeker, b *bof, pre *bof, offset_pre int binary.Read(buf_item, binary.LittleEndian, &wb.Codepage) case 0x3c: // CONTINUE if pre.Id == 0xfc { + // Skip pending richtext/phonetic formatting data that overflowed + // from the previous record when characters were already complete. + if wb.continue_utf16 == 0 && (wb.continue_rich_pending > 0 || wb.continue_apsb_pending > 0) { + if wb.continue_rich_pending > 0 { + available := int64(buf_item.Len()) + skip := int64(wb.continue_rich_pending) + if skip <= available { + buf_item.Seek(skip, io.SeekCurrent) + wb.continue_rich_pending = 0 + } else { + buf_item.Seek(0, io.SeekEnd) + wb.continue_rich_pending -= uint32(available) + } + } + if wb.continue_apsb_pending > 0 { + available := int64(buf_item.Len()) + skip := int64(wb.continue_apsb_pending) + if skip <= available { + buf_item.Seek(skip, io.SeekCurrent) + wb.continue_apsb_pending = 0 + } else { + buf_item.Seek(0, io.SeekEnd) + wb.continue_apsb_pending -= uint32(available) + } + } + if wb.continue_rich_pending == 0 && wb.continue_apsb_pending == 0 { + wb.continue_rich = 0 + wb.continue_apsb = 0 + offset_pre++ + } + } + var size uint16 var err error if wb.continue_utf16 >= 1 { @@ -226,27 +262,27 @@ func (w *WorkBook) get_string(buf io.ReadSeeker, size uint16) (res string, err e res = string(runes) } if richtext_num > 0 { - var bts []byte var seek_size int64 if w.Is5ver { seek_size = int64(2 * richtext_num) } else { seek_size = int64(4 * richtext_num) } - bts = make([]byte, seek_size) - err = binary.Read(buf, binary.LittleEndian, bts) - if err == io.EOF { + bts := make([]byte, seek_size) + n, readErr := io.ReadFull(buf, bts) + if readErr != nil { w.continue_rich = richtext_num + w.continue_rich_pending = uint32(seek_size) - uint32(n) + err = io.EOF } - - // err = binary.Read(buf, binary.LittleEndian, bts) } if phonetic_size > 0 { - var bts []byte - bts = make([]byte, phonetic_size) - err = binary.Read(buf, binary.LittleEndian, bts) - if err == io.EOF { + bts := make([]byte, phonetic_size) + n, readErr := io.ReadFull(buf, bts) + if readErr != nil { w.continue_apsb = phonetic_size + w.continue_apsb_pending = phonetic_size - uint32(n) + err = io.EOF } } } @@ -298,6 +334,9 @@ func (w *WorkBook) ReadAllCells(max int) (res [][]string) { } temp := make([][]string, leng) for k, row := range sheet.rows { + if row == nil { + continue + } data := make([]string, 0) if len(row.cols) > 0 { for _, col := range row.cols { @@ -307,7 +346,9 @@ func (w *WorkBook) ReadAllCells(max int) (res [][]string) { str := col.String(w) for i := uint16(0); i < col.LastCol()-col.FirstCol()+1; i++ { - data[col.FirstCol()+i] = str[i] + if int(i) < len(str) { + data[col.FirstCol()+i] = str[i] + } } } if leng > int(k) { diff --git a/worksheet.go b/worksheet.go index 0f4ec8b..9d1ebaf 100644 --- a/worksheet.go +++ b/worksheet.go @@ -3,7 +3,6 @@ package xls import ( "bytes" "encoding/binary" - "fmt" "io" "unicode/utf16" ) @@ -57,7 +56,6 @@ func (w *WorkSheet) parse(buf io.ReadSeeker) { break } } else { - fmt.Println(err) break } } @@ -67,7 +65,9 @@ func (w *WorkSheet) parse(buf io.ReadSeeker) { func (w *WorkSheet) parseBof(buf io.ReadSeeker, b *bof, pre *bof, col_pre interface{}) (*bof, interface{}) { var col interface{} var bts = make([]byte, b.Size) - binary.Read(buf, binary.LittleEndian, bts) + if err := binary.Read(buf, binary.LittleEndian, bts); err != nil { + return b, col + } buf = bytes.NewReader(bts) switch b.Id { // case 0x0E5: //MERGEDCELLS @@ -86,6 +86,9 @@ func (w *WorkSheet) parseBof(buf io.ReadSeeker, b *bof, pre *bof, col_pre interf w.addRow(r) case 0x0BD: //MULRK mc := new(MulrkCol) + if b.Size < 6 { + break + } size := (b.Size - 6) / 6 binary.Read(buf, binary.LittleEndian, &mc.Col) mc.Xfrks = make([]XfRk, size) @@ -96,6 +99,9 @@ func (w *WorkSheet) parseBof(buf io.ReadSeeker, b *bof, pre *bof, col_pre interf col = mc case 0x0BE: //MULBLANK mc := new(MulBlankCol) + if b.Size < 6 { + break + } size := (b.Size - 6) / 2 binary.Read(buf, binary.LittleEndian, &mc.Col) mc.Xfs = make([]uint16, size) @@ -110,6 +116,10 @@ func (w *WorkSheet) parseBof(buf io.ReadSeeker, b *bof, pre *bof, col_pre interf case 0x06: //FORMULA c := new(FormulaCol) binary.Read(buf, binary.LittleEndian, &c.Header) + if b.Size < 20 { + col = c + break + } c.Bts = make([]byte, b.Size-20) binary.Read(buf, binary.LittleEndian, &c.Bts) col = c @@ -182,10 +192,12 @@ func (w *WorkSheet) parseBof(buf io.ReadSeeker, b *bof, pre *bof, col_pre interf } if flag&0x8 != 0 { binary.Read(buf, binary.LittleEndian, &count) - var bts = make([]uint16, count) - binary.Read(buf, binary.LittleEndian, &bts) - runes := utf16.Decode(bts[:len(bts)-1]) - hy.TextMark = string(runes) + if count > 0 { + var bts = make([]uint16, count) + binary.Read(buf, binary.LittleEndian, &bts) + runes := utf16.Decode(bts[:len(bts)-1]) + hy.TextMark = string(runes) + } } w.addRange(&hy.CellRange, &hy)