Google

Friday, March 16, 2007

retrieve all image links from an HTML document

uses mshtml, ActiveX, COMObj, IdHTTP, idURI;

{ .... }

procedure GetImageLinks(AURL: string; AList: TStrings);
var
IDoc: IHTMLDocument2;
strHTML: string;
v: Variant;
x: Integer;
ovLinks: OleVariant;
DocURL: string;
URI: TidURI;
ImgURL: string;
idHTTP: TidHTTP;
begin
AList.Clear;
URI := TidURI.Create(AURL);
try
DocURL := 'http://' + URI.Host;
if URI.Path <> '/' then
DocURL := DocURL + URI.Path;
finally
URI.Free;
end;
Idoc := CreateComObject(Class_HTMLDocument) as IHTMLDocument2;
try
IDoc.designMode := 'on';
while IDoc.readyState <> 'complete' do
Application.ProcessMessages;
v := VarArrayCreate([0, 0], VarVariant);
idHTTP := TidHTTP.Create(nil);
try
strHTML := idHTTP.Get(AURL);
finally
idHTTP.Free;
end;
v[0] := strHTML;
IDoc.Write(PSafeArray(System.TVarData(v).VArray));
IDoc.designMode := 'off';
while IDoc.readyState <> 'complete' do
Application.ProcessMessages;
ovLinks := IDoc.all.tags('IMG');
if ovLinks.Length > 0 then
begin
for
x := 0 to ovLinks.Length - 1 do
begin
ImgURL := ovLinks.Item(x).src;
// The stuff below will probably need a little tweaking
// Deteriming and turning realtive URLs into absolute URLs
// is not that difficult but this is all I could come up with
// in such a short notice.
if (ImgURL[1] = '/') then
begin
// more than likely a relative URL so
// append the DocURL
ImgURL := DocURL + ImgUrl;
end
else
begin
if
(Copy(ImgURL, 1, 11) = 'about:blank') then
begin
ImgURL := DocURL + Copy(ImgUrl, 12, Length(ImgURL));
end;
end;
AList.Add(ImgURL);
end;
end;
finally
IDoc := nil;
end;
end;


// Beispiel:
// Example:
procedure TForm1.Button1Click(Sender: TObject);
begin
GetImageLinks('http://www.swissdelphicenter.ch', Memo1.Lines);
end;

No comments: