qsv/src/cmd/rename.rs at master · dathere/qsv · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
// Usage/help text for the `rename` command. `run` passes this string to
// `util::get_args` together with argv to populate `Args`, so the option and
// argument names spelled here must stay in sync with the `Args` field names.
// NOTE(review): presumably parsed docopt-style (`flag_*` / `arg_*` mapping) —
// confirm against `util::get_args` before rewording the "Usage:" section.
static USAGE: &str = r#"
Rename the columns of a CSV efficiently. It has two modes of operation:

Positional mode (default):
The new column names are given as a comma-separated list of names.
The number of column names given MUST match the number of columns in the
CSV unless "_all_generic" is used.

Pairwise mode:
The new column names are given as a comma-separated list of pairs of old and new
column names. The format is "old1,new1,old2,new2,...".

Examples:
  Change the column names of a CSV with three columns:
  $ qsv rename id,name,title

  Rename only specific columns using pairs:
  $ qsv rename --pairwise oldname,newname,oldcol,newcol

  Replace the column names with generic ones (_col_N):
  $ qsv rename _all_generic

  Add generic column names to a CSV with no headers:
  $ qsv rename _all_generic --no-headers

  Use column names that contains commas and conflict with the separator:
  $ qsv rename '"Date - Opening","Date - Actual Closing"'

For more examples, see https://github.com/dathere/qsv/blob/master/tests/test_rename.rs.

Usage:
    qsv rename [options] [--] <headers> [<input>]
    qsv rename --help

rename arguments:
    <headers>              The new headers to use for the CSV.
                           Separate multiple headers with a comma.
                           If "_all_generic" is given, the headers will be renamed
                           to generic column names, where the column name uses
                           the format "_col_N" where N is the 1-based column index.
                           Alternatively, specify pairs of old,new column names
                           to rename only specific columns.
    --pairwise             Invoke pairwise renaming.

Common options:
    -h, --help             Display this message
    -o, --output <file>    Write output to <file> instead of stdout.
    -n, --no-headers       When set, the header will be inserted on top.
    -d, --delimiter <arg>  The field delimiter for reading CSV data.
                           Must be a single character. (default: ,)
"#;

use foldhash::{HashMap, HashMapExt};
use serde::Deserialize;

use crate::{
    CliResult,
    config::{Config, Delimiter},
    util,
};

/// Command-line arguments for `rename`, deserialized from `USAGE` + argv by
/// `util::get_args` in `run`.
#[derive(Deserialize)]
struct Args {
    /// Optional input path handed to `Config::new` for the reader.
    arg_input:       Option<String>,
    /// The `<headers>` argument: a comma-separated header list, the literal
    /// `_all_generic` (compared case-insensitively), or old/new pairs when
    /// `--pairwise` is set.
    arg_headers:     String,
    /// Optional output path handed to `Config::new` for the writer.
    flag_output:     Option<String>,
    /// When set, the first input record is treated as data and the new header
    /// row is written in front of it.
    flag_no_headers: bool,
    /// Optional field delimiter applied to the reader configuration.
    flag_delimiter:  Option<Delimiter>,
    /// Enables pairwise ("old,new,...") renaming in `run`.
    flag_pairwise:   bool,
}

/// Entry point of the `rename` command: copies the input CSV to the output,
/// replacing (or, with `--no-headers`, inserting) the header row.
///
/// The new header row is chosen as follows:
/// * `_all_generic` (any ASCII case) produces `_col_1,_col_2,...` sized to the
///   input's column count.
/// * With `--pairwise`, when the header input has a header row and at least one
///   "old" name of the `old,new,...` list matches an existing header, only the
///   listed columns are renamed (see `parse_rename_pairs`).
/// * Otherwise `<headers>` is parsed as one CSV record and must contain exactly
///   as many fields as the input has columns.
///
/// # Errors
/// Returns an incorrect-usage error when a positional header list does not
/// match the input's column count, and propagates any argument-parsing, CSV
/// read/write or flush error.
pub fn run(argv: &[&str]) -> CliResult<()> {
    let args: Args = util::get_args(USAGE, argv)?;

    let rconfig = Config::new(args.arg_input.as_ref())
        .delimiter(args.flag_delimiter)
        .no_headers_flag(args.flag_no_headers);

    let mut rdr = rconfig.reader()?;
    let mut wtr = Config::new(args.flag_output.as_ref()).writer()?;

    // The keyword is pure ASCII, so an ASCII case-insensitive comparison is
    // enough and avoids allocating a lowercased copy of the argument.
    let all_generic = args.arg_headers.eq_ignore_ascii_case("_all_generic");

    if args.flag_no_headers {
        // Input has no header row, so read the first record to determine column count
        let mut record = csv::ByteRecord::new();
        if !rdr.read_byte_record(&mut record)? {
            // No data
            return Ok(());
        }
        let num_cols = record.len();
        let header_list = if all_generic {
            rename_headers_all_generic(num_cols)
        } else {
            args.arg_headers
        };
        let new_headers = parse_header_list(&header_list)?;
        if new_headers.len() != num_cols {
            return fail_incorrectusage_clierror!(
                "The length of the CSV columns ({}) is different from the provided header ({}).",
                num_cols,
                new_headers.len()
            );
        }
        wtr.write_record(&new_headers)?;
        // the record consumed for column counting is data and must be kept
        wtr.write_record(&record)?;
        while rdr.read_byte_record(&mut record)? {
            wtr.write_record(&record)?;
        }
    } else {
        // Input has a header row
        let headers = rdr.byte_headers()?;
        let new_headers = if all_generic {
            parse_header_list(&rename_headers_all_generic(headers.len()))?
        } else {
            // Pairwise renaming is attempted only when explicitly requested with
            // --pairwise AND the argument looks like pairs for this file. The pair
            // detection is skipped entirely otherwise.
            let pairwise_headers =
                if args.flag_pairwise && has_matching_pair(&args.arg_headers, headers) {
                    parse_rename_pairs(&args.arg_headers, headers).ok()
                } else {
                    None
                };
            match pairwise_headers {
                Some(renamed_headers) => renamed_headers,
                None => {
                    // Positional mode, also used as the fallback when pairwise
                    // parsing fails. Every positional rename goes through the
                    // same column-count validation.
                    let positional = parse_header_list(&args.arg_headers)?;
                    if positional.len() != headers.len() {
                        return fail_incorrectusage_clierror!(
                            "The length of the CSV headers ({}) is different from the provided \
                             one ({}).",
                            headers.len(),
                            positional.len()
                        );
                    }
                    positional
                },
            }
        };
        wtr.write_record(&new_headers)?;
        let mut record = csv::ByteRecord::new();
        while rdr.read_byte_record(&mut record)? {
            wtr.write_record(&record)?;
        }
    }
    Ok(wtr.flush()?)
}

/// Parses a comma-separated header list as a single CSV record, so quoted
/// names containing commas are kept as one field.
fn parse_header_list(header_list: &str) -> CliResult<csv::ByteRecord> {
    let mut list_rdr = csv::Reader::from_reader(header_list.as_bytes());
    Ok(list_rdr.byte_headers()?.clone())
}

/// Returns true when `pairs_str` splits on ',' into an even number of items and
/// at least one "old" name (items 0, 2, 4, ...) equals an existing header.
fn has_matching_pair(pairs_str: &str, headers: &csv::ByteRecord) -> bool {
    // `split` always yields at least one item, so an even count is also >= 2.
    pairs_str.split(',').count().is_multiple_of(2)
        && pairs_str
            .split(',')
            .step_by(2)
            .any(|old_name| headers.iter().any(|header| header == old_name.as_bytes()))
}

fn parse_rename_pairs(
    pairs_str: &str,
    original_headers: &csv::ByteRecord,
) -> CliResult<csv::ByteRecord> {
    let pairs: Vec<&str> = pairs_str.split(',').collect();
    if !pairs.len().is_multiple_of(2) {
        return fail_incorrectusage_clierror!(
            "Invalid number of arguments for pair-based renaming. Expected even number of values, \
             got {}.",
            pairs.len()
        );
    }

    // Create a mapping from old names to new names
    let mut rename_map = HashMap::new();
    for chunk in pairs.chunks(2) {
        if chunk.len() == 2 {
            // this assert is really just for the compiler to skip bounds checking below
            // per clippy::missing_asserts_for_indexing
            assert!(chunk.len() > 1);
            rename_map.insert(chunk[0], chunk[1]);
        }
    }

    // Create new headers by applying the rename map
    let mut new_headers = csv::ByteRecord::new();
    for header in original_headers {
        let header_str =
            std::str::from_utf8(header).map_err(|_| "Invalid UTF-8 in header".to_string())?;

        if let Some(&new_name) = rename_map.get(header_str) {
            new_headers.push_field(new_name.as_bytes());
        } else {
            new_headers.push_field(header);
        }
    }

    Ok(new_headers)
}

/// Produces the comma-separated generic header list `_col_1,_col_2,...,_col_N`
/// for `num_of_cols` columns (1-based); returns an empty string for zero columns.
pub fn rename_headers_all_generic(num_of_cols: usize) -> String {
    use std::fmt::Write;

    // Reserve 7 bytes per column up front; this is only a capacity hint and the
    // string grows on its own if the names turn out longer.
    let buffer = String::with_capacity(num_of_cols * 7);
    (1..=num_of_cols).fold(buffer, |mut names, col_no| {
        // every name except the first is preceded by a separator
        if !names.is_empty() {
            names.push(',');
        }
        // formatting into a String cannot fail, so the Result is safe to unwrap
        write!(names, "_col_{col_no}").unwrap();
        names
    })
}