diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index 486a25fa775c9..3e235796af424 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -460,11 +460,12 @@ impl CodeMap { for mbc in multibyte_chars.get().iter() { debug!("codemap: {:?}-byte char at {:?}", mbc.bytes, mbc.pos); if mbc.pos < bpos { - total_extra_bytes += mbc.bytes; + // every character is at least one byte, so we only + // count the actual extra bytes. + total_extra_bytes += mbc.bytes - 1; // We should never see a byte position in the middle of a // character - assert!(bpos == mbc.pos || - bpos.to_uint() >= mbc.pos.to_uint() + mbc.bytes); + assert!(bpos.to_uint() >= mbc.pos.to_uint() + mbc.bytes); } else { break; } diff --git a/src/libsyntax/parse/lexer.rs b/src/libsyntax/parse/lexer.rs index b711e95bc943b..5bace75a5eace 100644 --- a/src/libsyntax/parse/lexer.rs +++ b/src/libsyntax/parse/lexer.rs @@ -264,8 +264,7 @@ pub fn bump(rdr: &StringReader) { } if byte_offset_diff > 1 { - rdr.filemap.record_multibyte_char( - Pos::from_uint(current_byte_offset), byte_offset_diff); + rdr.filemap.record_multibyte_char(rdr.last_pos.get(), byte_offset_diff); } } else { rdr.curr.set(None); diff --git a/src/test/run-make/unicode-input/Makefile b/src/test/run-make/unicode-input/Makefile new file mode 100644 index 0000000000000..2d6ecd3c55efc --- /dev/null +++ b/src/test/run-make/unicode-input/Makefile @@ -0,0 +1,11 @@ +-include ../tools.mk + +all: + # check that we don't ICE on unicode input, issue #11178 + $(RUSTC) multiple_files.rs + $(call RUN,multiple_files) "$(RUSTC)" "$(TMPDIR)" + + # check that our multibyte-ident spans are (approximately) the + # correct length. 
issue #8706 + $(RUSTC) span_length.rs + $(call RUN,span_length) "$(RUSTC)" "$(TMPDIR)" diff --git a/src/test/run-make/unicode-input/multiple_files.rs b/src/test/run-make/unicode-input/multiple_files.rs new file mode 100644 index 0000000000000..68bec1d215a27 --- /dev/null +++ b/src/test/run-make/unicode-input/multiple_files.rs @@ -0,0 +1,65 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::{char, os, run, str}; +use std::rand::{task_rng, Rng}; +use std::io::File; + +// creates unicode_input_multiple_files_{main,chars}.rs, where the +// former imports the latter. `_chars` just contains an identifier +// made up of random characters, because the compiler will emit an error message +// about the ident being in the wrong place, with a span (and creating +// this span used to upset the compiler). 
+ +fn random_char() -> char { + let mut rng = task_rng(); + // a subset of the XID_start unicode table (ensuring that the + // compiler doesn't fail with an "unrecognised token" error) + let (lo, hi): (u32, u32) = match rng.gen_range(1, 4 + 1) { + 1 => (0x41, 0x5a), + 2 => (0xf8, 0x1ba), + 3 => (0x1401, 0x166c), + _ => (0x10400, 0x1044f) + }; + + char::from_u32(rng.gen_range(lo, hi + 1)).unwrap() +} + +fn main() { + let args = os::args(); + let rustc = args[1].as_slice(); + let tmpdir = Path::new(args[2].as_slice()); + + let main_file = tmpdir.join("unicode_input_multiple_files_main.rs"); + let main_file_str = main_file.as_str().unwrap(); + { + let _ = File::create(&main_file).unwrap() + .write_str("mod unicode_input_multiple_files_chars;"); + } + + for _ in range(0, 100) { + { + let randoms = tmpdir.join("unicode_input_multiple_files_chars.rs"); + let mut w = File::create(&randoms).unwrap(); + for _ in range(0, 30) { + let _ = w.write_char(random_char()); + } + } + + // rustc is passed to us with --out-dir and -L etc., so we + // can't exec it directly + let result = run::process_output("sh", [~"-c", rustc + " " + main_file_str]).unwrap(); + let err = str::from_utf8_lossy(result.error); + + // positive test so that this test will be updated when the + // compiler changes. + assert!(err.as_slice().contains("expected item but found")) + } +} diff --git a/src/test/run-make/unicode-input/span_length.rs b/src/test/run-make/unicode-input/span_length.rs new file mode 100644 index 0000000000000..c437b70baf3fc --- /dev/null +++ b/src/test/run-make/unicode-input/span_length.rs @@ -0,0 +1,62 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use std::{char, os, run, str}; +use std::rand::{task_rng, Rng}; +use std::io::File; + +// creates a file with `fn main() { <random ident> }` and checks the +// compiler emits a span of the appropriate length (for the +// "unresolved name" message); currently just using the number of code +// points, but should be the number of graphemes (FIXME #7043) + +fn random_char() -> char { + let mut rng = task_rng(); + // a subset of the XID_start unicode table (ensuring that the + // compiler doesn't fail with an "unrecognised token" error) + let (lo, hi): (u32, u32) = match rng.gen_range(1, 4 + 1) { + 1 => (0x41, 0x5a), + 2 => (0xf8, 0x1ba), + 3 => (0x1401, 0x166c), + _ => (0x10400, 0x1044f) + }; + + char::from_u32(rng.gen_range(lo, hi + 1)).unwrap() +} + +fn main() { + let args = os::args(); + let rustc = args[1].as_slice(); + let tmpdir = Path::new(args[2].as_slice()); + + let main_file = tmpdir.join("span_main.rs"); + let main_file_str = main_file.as_str().unwrap(); + + for _ in range(0, 100) { + let n = task_rng().gen_range(3u, 20); + + { + let _ = write!(&mut File::create(&main_file).unwrap(), + r"\#[feature(non_ascii_idents)]; fn main() \{ {} \}", + // random string of length n + range(0, n).map(|_| random_char()).collect::<~str>()); + } + + // rustc is passed to us with --out-dir and -L etc., so we + // can't exec it directly + let result = run::process_output("sh", [~"-c", rustc + " " + main_file_str]).unwrap(); + + let err = str::from_utf8_lossy(result.error); + + // the span should end the line (e.g no extra ~'s) + let expected_span = "^" + "~".repeat(n - 1) + "\n"; + assert!(err.as_slice().contains(expected_span)); + } +}