hpricot:Hpricot::Container::Trav#each_hyperlink_uri

なぜか Universal name がうまく使えないので、each_hyperlink_attribute を素の要素名で再定義する小細工をして each_hyperlink_uri を使います。

require "hpricot"

module Hpricot
  module Container::Trav
    # Yields every attribute beneath this node that may carry a hyperlink.
    #
    # Traverses the elements named below and, for each attribute of such an
    # element that can hold a URI reference, yields three values:
    # the element, the attribute name, and the attribute value.
    # Attributes for which +get_attribute+ returns nil/false are skipped.
    #
    # Plain element names are matched here rather than universal
    # ("{namespace-uri}name") names, which did not work for the author.
    #
    # Returns whatever +traverse_element+ returns.
    def each_hyperlink_attribute
      traverse_element(
        'a',
        'area',
        'link',
        'img',
        'object',
        'q',
        'blockquote',
        'ins',
        'del',
        'form',
        'input',
        'head',
        'base',
        'script'
      ) do |elem|
        # Patterns are anchored at both ends (\A ... \z) so that a name which
        # merely ends with one of these tokens is not misclassified.
        attrs =
          case elem.name
          when /\A(?:base|a|area|link)\z/i      then %w[href]
          when /\A(?:img)\z/i                   then %w[src longdesc usemap]
          when /\A(?:object)\z/i                then %w[classid codebase data usemap]
          when /\A(?:q|blockquote|ins|del)\z/i  then %w[cite]
          when /\A(?:form)\z/i                  then %w[action]
          when /\A(?:input)\z/i                 then %w[src usemap]
          when /\A(?:head)\z/i                  then %w[profile]
          when /\A(?:script)\z/i                then %w[src for]
          else
            # Unknown element name: yield nothing instead of calling
            # +each+ on nil.
            []
          end

        attrs.each do |attr|
          hyperlink = elem.get_attribute(attr)
          yield elem, attr, hyperlink if hyperlink
        end
      end
    end
  end
end

# Absolute links: no base URI is passed. Each pair printed is the raw
# attribute value followed by the URI object built from it.
doc = Hpricot('<a href="http://www.foo.org/">foo</a><p><a href="http://www.bar.org/">bar</a></p>')
doc.each_hyperlink_uri do |hyperlink, uri|
  p [hyperlink, uri]
end

# Relative links: the argument is the base URI they are resolved against.
doc = Hpricot('<a href="foo.html">foo</a><p><a href="bar.html">bar</a></p>')
doc.each_hyperlink_uri("http://www.hoge.org/") do |hyperlink, uri|
  p [hyperlink, uri]
end

で、実行結果は次のとおりです。

["http://www.foo.org/", #<URI::HTTP:0xb7c95f9c URL:http://www.foo.org/>]
["http://www.bar.org/", #<URI::HTTP:0xb7c95d08 URL:http://www.bar.org/>]
["foo.html", #<URI::HTTP:0xb7c951f0 URL:http://www.hoge.org/foo.html>]
["bar.html", #<URI::HTTP:0xb7c94e44 URL:http://www.hoge.org/bar.html>]