# -*- coding: utf-8 -*-
#
# Imports pages from a Wikipedia XML dump into a `documents` table through
# ActiveRecord. Each <page> element is buffered while the dump is stream-parsed,
# re-parsed with Hpricot, and stored as one Document row.
$KCODE = 'u'

require 'rexml/streamlistener'
require 'stringio' # StringIO buffers each <page> element
require 'time'     # Time.parse
require 'cgi'
require 'rubygems'
require 'active_record'
require 'hpricot'

#################################################
## Rewrite the following to your configurations
#################################################
ActiveRecord::Base.establish_connection(
  :adapter  => 'postgresql',
  :host     => 'localhost',
  :username => 'postgres',
  :password => '********',
  :database => 'wikipedia_ja'
)

################################################
## Create Database
## (ex. PostgreSQL)
## $ createdb -U postgres wikipedia_ja
##
## Create Table
## (ex. PostgreSQL table schema)
## wikipedia_ja=# \d documents
##                        Table "public.documents"
##     Column    |            Type             |                       Modifiers
## --------------+-----------------------------+--------------------------------------------------------
##  id           | integer                     | not null default nextval('documents_id_seq'::regclass)
##  url          | character varying(1024)     |
##  title        | character varying(255)      |
##  author       | character varying(255)      |
##  plain_text   | text                        |
##  wiki_text    | text                        |
##  created_time | timestamp without time zone |
##  updated_on   | timestamp without time zone |
##  entity_id    | integer                     |
## Indexes:
##     "documents_pkey" PRIMARY KEY, btree (id)
##     "documents_entity_id_index" btree (entity_id)
##
## (ex. SQL for PostgreSQL)
## CREATE TABLE documents (
##   "id" serial primary key,
##   "url" character varying(1024) DEFAULT NULL,
##   "title" character varying(255) DEFAULT NULL,
##   "author" character varying(255) DEFAULT NULL,
##   "plain_text" text DEFAULT NULL,
##   "wiki_text" text DEFAULT NULL,
##   "created_time" timestamp DEFAULT NULL,
##   "updated_on" timestamp DEFAULT NULL,
##   "entity_id" integer DEFAULT NULL
## );
## CREATE INDEX "index_documents_on_entity_id" ON documents ("entity_id");
################################################

# One imported Wikipedia page, stored in the `documents` table.
class Document < ActiveRecord::Base
  # Saves the record, returning false instead of raising when the database
  # rejects the statement (ActiveRecord::StatementInvalid).
  # Any arguments are forwarded unchanged to ActiveRecord's own #save.
  def save(*args)
    super
  rescue ActiveRecord::StatementInvalid
    false
  end

  class << self
    # Imports the dump at +filepath+; thin alias for Document.import.
    def import_from(filepath)
      Document.import(filepath)
    end

    # Stream-parses the XML dump at +filepath+ (defaults to
    # "./jawiki-latest-pages-articles.xml") and feeds it to a
    # WikipediaListener, which creates the Document rows.
    def import(filepath = nil)
      filepath = "./jawiki-latest-pages-articles.xml" unless filepath
      list = WikipediaListener.new
      # Block form so the dump file is closed even if parsing raises.
      File.open(filepath) do |source|
        REXML::Document.parse_stream(source, list)
      end
    end
  end
end

# REXML stream listener that re-serialises each <page> element into a buffer
# and, when the element ends, turns it into a Document row.
class WikipediaListener
  include REXML::StreamListener

  BASEURL = "http://ja.wikipedia.org/wiki/"
  # Texts shorter than this are not converted to plain text (see #trimming).
  TEXTMINSIZE = 256

  def initialize
    @document = nil
    @buf = nil
  end

  # Starts a fresh buffer on <page>, then writes the opening tag and its
  # attributes into the current buffer.
  def tag_start(name, attrs)
    if name == "page"
      @document = nil
      @buf = StringIO.new
    end
    return nil if skip?

    @buf.write(%|<#{REXML::Text::normalize(name)}|)
    attrs.each do |pair|
      @buf.write(%| #{REXML::Text::normalize(pair[0])}="#{REXML::Text::normalize(pair[1])}"|)
    end
    @buf.write(">")
  end

  # Appends (re-escaped) character data to the current buffer.
  def text(text)
    return nil if skip?
    @buf.write(REXML::Text::normalize(text))
  end

  # Writes the closing tag; on </page> parses the buffered markup with Hpricot,
  # processes it and closes the buffer.
  def tag_end(name)
    return nil if skip?

    # The closing tag must be written, otherwise the buffered markup is never
    # well-formed and element lookups in #proc_document see unclosed elements.
    @buf.write(%|</#{REXML::Text::normalize(name)}>|)
    if name == "page"
      begin
        proc_document(Hpricot.XML(@buf.string))
      rescue => e
        # NOTE(review): any error aborts the whole import, and bare `exit`
        # reports status 0 -- confirm whether a non-zero status is wanted.
        puts e
        exit
      end
      @buf.close
    end
  end

  private

  # Builds and saves a Document from one parsed <page>.
  # Pages whose title is empty or matches the namespace filter below are
  # skipped, as are pages whose entity_id is already stored.
  def proc_document(doc)
    title = doc.at(:title).inner_text
    return if !title || title == ""
    return if title =~ /(Media|特別|Wikipedia|利用者|ノート|画像|Template|Category|Portal|MediaWiki)(:|;|-|=|‐)/

    id = doc.at(:id).inner_text
    @document = Document.find_or_initialize_by_entity_id(id)
    return unless @document.new_record?

    @document.created_time = Time.parse(doc.at(:timestamp).inner_text)
    @document.title = title
    if author = doc.at(:username) || doc.at(:ip)
      @document.author = author.inner_text
    else
      @document.author = "anonymous"
    end

    text = doc.at(:text).inner_text
    raw = REXML::Text::unnormalize(text)
    # #trimming mutates its argument, so hand it a copy and keep `raw` intact.
    @document.plain_text = trimming(raw.dup)
    @document.wiki_text = raw.gsub(/[  \t\r\f]/, " ")
    @document.url = BASEURL + CGI.escape(@document.title.gsub(/ /, "_"))
    p @document.title
    @document.save
  end

  # Strips wiki markup from +text+ IN PLACE and returns it.
  # Returns nil when +text+ is nil, shorter than TEXTMINSIZE, or has a line
  # starting with "#REDIRECT".
  def trimming(text)
    # Check the argument itself; the original tested `self.blank?`, i.e. the
    # listener object, which never guards against a missing text.
    return nil if text.nil? || (text.size < TEXTMINSIZE) || (text =~ /^#REDIRECT/)

    text.gsub!(/^=+([^=]+)=+/) { $1 }                       # == heading ==
    text.gsub!(/<[^>]+>/, "")                               # <tags>
    text.gsub!(/^\s*[\*#:|;-]+\s*/, "")                     # leading list / indent markers
    text.gsub!(/\[\[[^\]\|]+\|([^\]]+)\]\]/) { $1 }         # [[target|label]] -> label
    text.gsub!(/\[\[([a-zA-Z-]+:)?([^\]]+)\]\]/) { $2 }     # [[prefix:target]] -> target
    text.gsub!(/\{\{([^\}\|]+)\|[^\}]+\}\}/) { $1 }         # {{name|args}} -> name
    text.gsub!(/\{\{([^\}]+)\}\}/) { $1 }                   # {{name}} -> name
    text.gsub!(/\[http:[^ \]]+ ([^\]]+)\]/) { $1 }          # [http://url label] -> label
    text.gsub!(/'{2,}/, "")                                 # runs of two or more apostrophes
    # NOTE(review): the trailing `|` is an empty alternative, so this only
    # strips leading spaces and an optional "{"; `\{?\|` may have been
    # intended -- left unchanged pending confirmation.
    text.gsub!(/^ *\{?|/, "")
    text.gsub!(/^ *[\!\|\}]/, "")
    text.gsub!(/^\*+/, "")
    text.gsub!(/[a-zA-Z]+=\"[^\"].*\"/, "")                 # attr="value"
    text.gsub!(/[a-z][a-z]+=[0-9]+/, "")                    # attr=123
    text.gsub!(/.*border-style.*/, "")
    text.gsub!(/.*valign=.*/, "")
    text.gsub!(/\&[a-zA-Z]+;/, "")                          # named character entities
    text.gsub!(/.*(利用者|会話|ノート):.*/, "")
    text.gsub!(/(Wikipedia|Category):/, "")
    text.gsub!(/.*語:/, "")
    text.gsub!(/^thumb\|/, "")
    text.gsub!(/画像:/, "")
    text.gsub!(/^[ +]*[\|]*/, "")
    text.gsub!(/\|\|/, " ")
    text.gsub!(/\s/, " ")
    return text
  end

  # True when there is no open page buffer to write into.
  def skip?
    @buf.nil? || @buf.closed?
  end
end

# Adds find_or_initialize_by_<attributes> dynamic class methods to
# ActiveRecord::Base.
module ActsAsFindOrInitializeBy
  def self.included(base)
    base.extend ClassMethods
  end

  module ClassMethods
    # Handles find_or_initialize_by_* calls: returns the first matching record,
    # or a new unsaved record built from the given attribute values.
    # Every other call is passed on to the next method_missing.
    def method_missing(method_id, *arguments)
      if match = /find_or_initialize_by_([_a-zA-Z]\w*)/.match(method_id.to_s)
        attribute_names = extract_attribute_names_from_match(match)
        # Return here so unknown attributes are never used in a query below.
        return super unless all_attributes_exists?(attribute_names)

        find(:first, :conditions => construct_conditions_from_arguments(attribute_names, arguments)) ||
          new(construct_attributes_from_arguments(attribute_names, arguments))
      else
        super
      end
    end
  end

  module InstanceMethods
    def self.included(base)
      base.extend SingletonMethods
    end

    module SingletonMethods
    end
  end
end

ActiveRecord::Base.send :include, ActsAsFindOrInitializeBy

# Command-line entry point: import the dump given as the only argument, or the
# default dump in the current directory when no argument is given.
if $0 == __FILE__
  if ARGV.size > 1
    puts "Usage: #{$0} WIKIPEDIA_XML_FILE_PATH"
    exit
  end

  begin
    if filepath = ARGV.shift
      raise "File not found(#{filepath})" unless FileTest.exist?(filepath)
      Document.import_from(filepath)
    else
      raise "Usage: #{$0} WIKIPEDIA_XML_FILE_PATH\n or: File not found(./jawiki-latest-pages-articles.xml)\n see also: #{$0} header comments" unless FileTest.exist?('./jawiki-latest-pages-articles.xml')
      Document.import
    end
  rescue => e
    puts e
    exit
  end
end