htmlParser for delphi

(******************************************************)
(* **工作室 *)
(* HTML解析单元库 *)
(* *)
(* DxHtmlParser Unit *)
(* *)
(* email:316454904@qq.com QQ:316454904 *)
(******************************************************)
unit MyHtmlParser;
interface
uses Windows,MSHTML,ActiveX,Forms,Variants, SysUtils, Classes;
type
TMyHtmlParser = class
private
Doc: IHTMLDocument2;
FHTML, FURL: string;
procedure SetHTML(const Value: string);
procedure SetURL(s: string);
public
Doc2:IHTMLDocument2;
FParserOK:boolean;
FTimeOut:integer;
constructor Create;
destructor Destroy;override;
property HTML: string read FHTML write SetHTML;
property URL: string read FURL write SetURL;
property TimeOut:integer read FTimeOut write FTimeOut default 20000;
property ParserOK:boolean read FParserOK default false;
end;
implementation
{ TDxHtmlParser }
procedure TMyHtmlParser.SetURL(s: string);
var
doc4:ihtmldocument4;
tick:integer;
begin
FURL:=s ;
if FURL<>'' then
begin
tick:=gettickcount;
doc.QueryInterface(IID_ihtmldocument4,doc4);
if assigned(doc4) then
begin
doc2:=doc4.createDocumentFromUrl(s,'null');
while (doc2.readyState<>'complete') and (gettickcount-tick<FTimeOut) do
begin
application.ProcessMessages;
sleep(10);
end;
if doc2.readyState='complete' then FParserOK:=true;
end;
end;
end;
constructor TMyHtmlParser.Create;
begin
CoInitialize(nil);
//创建IHTMLDocument2接口
FTimeOut:=20000;
CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, Doc);
Assert(Doc<>nil,'构建HTMLDocument接口失败');
Doc.Set_designMode('On'); //设置为设计模式,不执行脚本
while not (Doc.readyState = 'complete') do
begin
sleep(1);
Application.ProcessMessages;
end;
HTML:='<html></html>';
end;
destructor TMyHtmlParser.Destroy;
begin
CoUninitialize;
inherited;
end;
procedure TMyHtmlParser.SetHTML(const Value: string);
var
V: OLEVariant;
vDocument: OLEVariant;
vMimeType: OLEVariant;
vHtml: OLEVariant;
tick:integer;
begin
if FHTML <> Value then
begin
tick:=gettickcount;
FHTML := Value;
V := Doc;
vDocument := V.script.Document;
vMimeType := 'text/Html';
vHtml := FHtml;
vDocument.Open(vMimeType);
vDocument.Clear;
vDocument.Write(vHtml);
vDocument.Close;
while (doc.readyState<>'complete') and (gettickcount-tick<FTimeOut) do
begin
application.ProcessMessages;
sleep(10);
end;
if doc.readyState='complete' then
begin
FParserOK:=true;
doc2:=doc;
end;
end;
end;
end.

受到得闲老师的htmlparser启发,完善了一下,去掉的自认为没必要的东西(有了IhtmlDocument2,神马都是浮云),当然不是完全抄自得闲老师的解析器,本单元中的精华是SetHTML(const Value: string);和SetURL(s: string);这两个函数,其它的没什么意思。

SetHTML(const Value: string)是抄自TEmbeddedwb的IEParser。

SetURL(s: string);是根据MSDN上ihtmlDocument4.createDocumentFromUrl创建出新的ihtmlDocument2接口。

不解释了,代码就这点。

不足的地方:doc2会自动去下载图片,如有朋友修改后还请发我一份,谢谢!!