[Haml] Add -# encoding: support.

This commit is contained in:
Nathan Weizenbaum 2010-05-29 16:51:36 -07:00
parent 919fcf0c8d
commit ebaf1baa42
6 changed files with 310 additions and 5 deletions

View File

@ -3,6 +3,20 @@
* Table of contents
{:toc}
## 3.0.7 (Unreleased)
### Encoding Support
Haml 3.0.7 adds support for Ruby-style `-# coding:` comments
for declaring the encoding of a template.
For details see {file:HAML_REFERENCE.md#encodings the reference}.
This also slightly changes the behavior of Haml when the
{file:HAML_REFERENCE.md#encoding-option `:encoding` option} is not set.
Rather than defaulting to `"utf-8"`,
it defaults to the encoding of the source document,
and only falls back to `"utf-8"` if this encoding is `"us-ascii"`.
## 3.0.6
[Tagged on GitHub](http://github.com/nex3/haml/commit/3.0.6).

View File

@ -197,7 +197,9 @@ Available options are:
Note that Haml **does not** automatically re-encode Ruby values;
any strings coming from outside the application should be converted
before being passed into the Haml template.
Defaults to `Encoding.default_internal` or, if that's not set, `"utf-8"`.
Defaults to `Encoding.default_internal`; if that's not set,
defaults to the encoding of the Haml template;
if that's `us-ascii`, defaults to `"utf-8"`.
<br/><br/> <!-- There's no better way to do a paragraph break in a dl in Maruku -->
Many Ruby database drivers are not yet Ruby 1.9 compatible;
in particular, they return strings marked as ASCII-encoded
@ -207,6 +209,25 @@ Available options are:
set `:encoding` to `"ascii-8bit"`, or try to get the authors of the database drivers
to make them Ruby 1.9 compatible.
### Encodings
When using Ruby 1.9 or later,
Haml supports the same sorts of encoding-declaration comments that Ruby does.
Although both Ruby and Haml support several different styles,
the easiest is just to add `-# coding: encoding-name`
at the beginning of the Haml template
(it must come before all other lines).
This will tell Haml that the template is encoded using the named encoding.
By default, the HTML generated by Haml has the same encoding as the Haml template.
However, if `Encoding.default_internal` is set, Haml will attempt to use that instead.
In addition, the [`:encoding` option](#encoding-option) can be used
to specify an output encoding manually.
Note that, like Ruby, Haml does not support templates encoded in UTF-16 or UTF-32,
since these encodings are not compatible with ASCII.
It is possible to use these as the output encoding, though.
## Plain Text
A substantial portion of any HTML document is its content,

View File

@ -7,7 +7,7 @@
### Encoding Support
Add support for `@charset` for declaring the encoding of a stylesheet.
Sass 3.0.7 adds support for `@charset` for declaring the encoding of a stylesheet.
For details see {file:SASS_REFERENCE.md#encodings the reference}.
### Bug Fixes

View File

@ -85,8 +85,15 @@ module Haml
:format => :xhtml,
:escape_html => false,
}
template = check_haml_encoding(template) do |msg, line|
raise Haml::Error.new(msg, line)
end
unless ruby1_8?
@options[:encoding] = Encoding.default_internal || "utf-8"
@options[:encoding] = Encoding.default_internal || template.encoding
@options[:encoding] = "utf-8" if @options[:encoding].name == "US-ASCII"
end
@options.merge! options.reject {|k, v| v.nil?}
@index = 0
@ -99,8 +106,6 @@ module Haml
@options[:encoding] = @options[:encoding].name
end
template = check_encoding(template) {|msg, line| raise Haml::Error.new(msg, line)}
# :eod is a special end-of-document marker
@template = (template.rstrip).split(/\r\n|\r|\n/) + [:eod, :eod]
@template_index = 0

View File

@ -2,6 +2,7 @@ require 'erb'
require 'set'
require 'enumerator'
require 'stringio'
require 'strscan'
require 'haml/root'
require 'haml/util/subset_map'
@ -434,6 +435,37 @@ MSG
return str
end
# Like {\#check\_encoding}, but also checks for a Ruby-style `-# coding:` comment
# at the beginning of the template and uses that encoding if it exists.
#
# The Sass encoding rules are simple.
# If a `-# coding:` comment exists,
# we assume that that's the original encoding of the document.
# Otherwise, we use whatever encoding Ruby has.
#
# Haml uses the same rules for parsing coding comments as Ruby.
# This means that it can understand Emacs-style comments
# (e.g. `-*- encoding: "utf-8" -*-`),
# and also that it cannot understand non-ASCII-compatible encodings
# such as `UTF-16` and `UTF-32`.
#
# @param str [String] The Haml template of which to check the encoding
# @yield [msg] A block in which an encoding error can be raised.
# Only yields if there is an encoding error
# @yieldparam msg [String] The error message to be raised
# @return [String] The original string encoded properly
# @raise [ArgumentError] if the document declares an unknown encoding
def check_haml_encoding(str, &block)
return check_encoding(str, &block) if ruby1_8?
bom, encoding = parse_haml_magic_comment(str)
if encoding; str.force_encoding(encoding)
elsif bom; str.force_encoding("UTF-8")
end
return check_encoding(str, &block)
end
# Like {\#check\_encoding}, but also checks for a `@charset` declaration
# at the beginning of the file and uses that encoding if it exists.
#
@ -695,5 +727,36 @@ METHOD
return lcs_backtrace(c, x, y, i, j-1, &block) if c[i][j-1] > c[i-1][j]
return lcs_backtrace(c, x, y, i-1, j, &block)
end
# Looks for an encoding declaration ("magic comment") on the first line
# of a Haml template, following roughly the same rules Ruby uses.
#
# The template is scanned as raw bytes on a copy, so `str` itself is untouched.
# An Emacs-style `-*- coding: name -*-` declaration is tried first;
# if there is none, any `coding:` or `coding=` declaration is accepted.
#
# @param str [String] The Haml template source
# @return [(String or nil, String), String, nil]
#   When an encoding is declared: a pair of the UTF-8 BOM bytes
#   (nil if the document has no BOM) and the declared encoding name.
#   When none is declared: just the BOM bytes, or nil if there is no BOM
def parse_haml_magic_comment(str)
  scanner = StringScanner.new(str.dup.force_encoding("BINARY"))
  bom = scanner.scan(/\xEF\xBB\xBF/n)

  # Only a leading silent comment (`-#`) can carry a declaration.
  if scanner.scan(/-\s*#\s*/n)
    emacs_coding = try_parse_haml_emacs_magic_comment(scanner)
    if emacs_coding
      [bom, emacs_coding]
    elsif scanner.scan(/.*?coding[=:]\s*([\w-]+)/in)
      [bom, scanner[1]]
    else
      bom
    end
  else
    bom
  end
end
# Tries to read a single Emacs-style `-*- name: value -*-` declaration
# starting at the scanner's current position.
#
# The scanner is always rewound to where it started before this returns,
# whether or not a declaration was found.
#
# @param scanner [StringScanner] Scanner positioned just after the `-#` comment marker
# @return [String, nil] The declared value with any surrounding double quotes removed,
#   or nil if there is no such declaration or its name does not contain `coding`
def try_parse_haml_emacs_magic_comment(scanner)
  start = scanner.pos
  begin
    return nil unless scanner.scan(/.*?-\*-\s*/n)
    # From Ruby's parse.y
    return nil unless scanner.scan(/([^\s'":;]+)\s*:\s*("(?:\\.|[^"])*"|[^"\s;]+?)[\s;]*-\*-/n)

    key = scanner[1]
    value = scanner[2]
    return nil unless key =~ /(en)?coding/in

    # Strip one pair of enclosing double quotes, if present.
    unquoted = value[/^"(.*)"$/n, 1]
    unquoted.nil? ? value : unquoted
  ensure
    scanner.pos = start
  end
end
end
end

View File

@ -1604,6 +1604,202 @@ HAML
assert_equal(3, e.line)
assert_equal('Invalid UTF-16LE character "\xFE"', e.message)
end
# A `-# coding: utf-8` comment on a template that is already UTF-8
# should render UTF-8 output with the same text.
def test_same_coding_comment_as_encoding
  expected_html = <<HTML
<p>bâr</p>
<p>föö</p>
HTML
  template = <<HAML
-# coding: utf-8
%p bâr
%p föö
HAML
  assert_renders_encoded(expected_html, template)
end
# A `-# coding: ibm866` comment should make the rendered output
# carry the IBM866 encoding with the same bytes.
def test_different_coding_comment_than_encoding
  expected_html = <<HTML.force_encoding("IBM866")
<p>bâr</p>
<p>föö</p>
HTML
  template = <<HAML
-# coding: ibm866
%p bâr
%p föö
HAML
  assert_renders_encoded(expected_html, template)
end
# A template that is itself IBM866-encoded, with no coding comment,
# should render output in that same encoding.
def test_different_coding_than_system
  expected_html = <<HTML.encode("IBM866")
<p>тАЬ</p>
HTML
  template = <<HAML.encode("IBM866")
%p тАЬ
HAML
  assert_renders_encoded(expected_html, template)
end
# The coding comment keyword and encoding name are matched case-insensitively.
def test_case_insensitive_coding_comment
  expected_html = <<HTML.force_encoding("IBM866")
<p>bâr</p>
<p>föö</p>
HTML
  template = <<HAML
-# CodINg: IbM866
%p bâr
%p föö
HAML
  assert_renders_encoded(expected_html, template)
end
# A coding comment written without any optional whitespace is still recognized.
def test_whitespace_insensitive_coding_comment
  expected_html = <<HTML.force_encoding("IBM866")
<p>bâr</p>
<p>föö</p>
HTML
  template = <<HAML
-#coding:ibm866
%p bâr
%p föö
HAML
  assert_renders_encoded(expected_html, template)
end
def test_equals_coding_comment
assert_renders_encoded(<<HTML.force_encoding("IBM866"), <<HAML)
<p>bâr</p>
<p>föö</p>
HTML
-# CodINg= ibm866
%p bâr
%p föö
HAML
end
def test_prefixed_coding_comment
assert_renders_encoded(<<HTML.force_encoding("IBM866"), <<HAML)
<p>bâr</p>
<p>föö</p>
HTML
-# foo BAR FAOJcoding: ibm866
%p bâr
%p föö
HAML
end
# Trailing text after the encoding name on the comment line is ignored.
def test_suffixed_coding_comment
  expected_html = <<HTML.force_encoding("IBM866")
<p>bâr</p>
<p>föö</p>
HTML
  template = <<HAML
-# coding: ibm866 ASFJ (&(&#!$
%p bâr
%p föö
HAML
  assert_renders_encoded(expected_html, template)
end
# An opening `-*-` marker with no closing marker still yields the declared encoding.
def test_emacs_prefixed_coding_comment
  expected_html = <<HTML.force_encoding("IBM866")
<p>bâr</p>
<p>föö</p>
HTML
  template = <<HAML
-# -*- coding: ibm866
%p bâr
%p föö
HAML
  assert_renders_encoded(expected_html, template)
end
def test_emacs_suffixed_coding_comment
assert_renders_encoded(<<HTML.force_encoding("IBM866"), <<HAML)
<p>bâr</p>
<p>föö</p>
HTML
-# coding: ibm866 -*- coding: blah
%p bâr
%p föö
HAML
end
def test_emacs_coding_comment
assert_renders_encoded(<<HTML.force_encoding("IBM866"), <<HAML)
<p>bâr</p>
<p>föö</p>
HTML
-# -*- coding: ibm866 -*-
%p bâr
%p föö
HAML
end
def test_emacs_encoding_comment
assert_renders_encoded(<<HTML.force_encoding("IBM866"), <<HAML)
<p>bâr</p>
<p>föö</p>
HTML
-# -*- encoding: ibm866 -*-
%p bâr
%p föö
HAML
end
# The encoding name in an Emacs-style declaration may be wrapped in double quotes.
def test_quoted_emacs_coding_comment
  expected_html = <<HTML.force_encoding("IBM866")
<p>bâr</p>
<p>föö</p>
HTML
  template = <<HAML
-# -*- coding: "ibm866" -*-
%p bâr
%p föö
HAML
  assert_renders_encoded(expected_html, template)
end
# An Emacs-style declaration written with no optional whitespace is still recognized.
def test_whitespace_insensitive_emacs_coding_comment
  expected_html = <<HTML.force_encoding("IBM866")
<p>bâr</p>
<p>föö</p>
HTML
  template = <<HAML
-#-*-coding:ibm866-*-
%p bâr
%p föö
HAML
  assert_renders_encoded(expected_html, template)
end
# An Emacs-style declaration padded with extra whitespace is still recognized.
#
# This method used to be a byte-for-byte duplicate of
# test_whitespace_insensitive_emacs_coding_comment; because both had the same
# name, the later definition silently replaced the earlier one. It now has its
# own name and covers the opposite whitespace extreme.
def test_extra_whitespace_emacs_coding_comment
  assert_renders_encoded(<<HTML.force_encoding("IBM866"), <<HAML)
<p>bâr</p>
<p>föö</p>
HTML
-#   -*-   coding:   ibm866   -*-
%p bâr
%p föö
HAML
end
# The coding entry is found even when it sits among other
# `name: value` entries inside the `-*- ... -*-` markers.
def test_one_of_several_emacs_comments
  expected_html = <<HTML.force_encoding("IBM866")
<p>bâr</p>
<p>föö</p>
HTML
  template = <<HAML
-# -*- foo: bar; coding: ibm866; baz: bang -*-
%p bâr
%p föö
HAML
  assert_renders_encoded(expected_html, template)
end
# A complete Emacs-style declaration takes precedence over
# a plain `coding:` declaration that appears before it on the line.
def test_prefixed_emacs_coding_comment
  expected_html = <<HTML.force_encoding("IBM866")
<p>bâr</p>
<p>föö</p>
HTML
  template = <<HAML
-# foo bar coding: baz -*- coding: ibm866 -*-
%p bâr
%p föö
HAML
  assert_renders_encoded(expected_html, template)
end
# A complete Emacs-style declaration takes precedence over
# a plain `coding:` declaration that appears after it on the line.
def test_suffixed_emacs_coding_comment
  expected_html = <<HTML.force_encoding("IBM866")
<p>bâr</p>
<p>föö</p>
HTML
  template = <<HAML
-# -*- coding: ibm866 -*- foo bar coding: baz
%p bâr
%p föö
HAML
  assert_renders_encoded(expected_html, template)
end
end
private
@ -1618,4 +1814,10 @@ HAML
<p>föö</p>
HTML
end
# Renders a Haml template and asserts that the output matches the expected
# HTML in encoding first, then in content.
#
# @param html [String] The expected output, tagged with the expected encoding
# @param haml [String] The Haml template to render
def assert_renders_encoded(html, haml)
  rendered = render(haml)
  assert_equal(html.encoding, rendered.encoding)
  assert_equal(html, rendered)
end
end